From 2e7ce3bb6bf0f5f2b9e76bbb7bcb5aa93feb2f96 Mon Sep 17 00:00:00 2001 From: Qianfeng Date: Wed, 25 May 2022 01:19:12 +0800 Subject: [PATCH] Overhaul of Reduction and its dependents (#237) * Tiny fix in dynamic_buffer.hpp to support vectorized AtomicAdd for double type * Update to host layer and host reduction * Merge and remove reduction kernels * Merge and remove reduction device interfaces and update pooling device interface * Merge and remove useless reduction device instances * Update to reduction profiler and reduction ctests * Update to reduction and pooling examples and add one reduction example * Change reduction examples to make them testable by ctest * Add explicit pass checking for reduction and pooling examples * Explicit assignment of tensor shapes in example reduce_blockwise_two_call * Use atomic_add to replace atomicAdd and add atomic_add for double type * Add reduce ctest support for double data type * Replace to_int_vector() by using C++ std::vector::assign() * Keep DeviceReduceThreadWise separated from DeviceReduceBlockWise * Merge DeviceReduceBlockWise and DeviceReduceMultiBlockAtomicAdd into DeviceReduceMultiBlock * Add GetAtomicOperationZeroValue() support for AtomicMax * Tiny change to reduce example README.md * Fix some tiny issues due to branch merging * Revert previous change in dynamic_buffer.hpp and add atomic_add for double2_t * Add reduce multiblock_atomic_add instances for fp64 to verify vectorized atomic_add on fp64 * Renaming * Clean up the header includes in device_reduce instance header files [ROCm/composable_kernel commit: 63eee2d9991b08ca286f6895dd8f90da12a62da3] --- example/12_reduce/CMakeLists.txt | 3 +- example/12_reduce/README.md | 41 +- example/12_reduce/reduce_blockwise.cpp | 188 ++-- .../12_reduce/reduce_blockwise_two_call.cpp | 290 ++++++ example/13_pool2d_fwd/README.md | 10 +- example/13_pool2d_fwd/pool2d_fwd.cpp | 114 ++- .../device/device_pool2d_fwd_nhwc_nhwc.hpp | 45 +- .../gpu/device/device_reduce.hpp | 29 +- .../gpu/device/device_reduce_blockwise.hpp | 374 -------- .../device_reduce_blockwise_second_call.hpp | 328 ------- .../gpu/device/device_reduce_common.hpp | 18 +- ...c_add.hpp => device_reduce_multiblock.hpp} | 311 +++--- ...evice_reduce_multiblock_partial_reduce.hpp | 440 --------- .../gpu/device/device_reduce_threadwise.hpp | 145 ++- .../grid/gridwise_2d_reduction_blockwise.hpp | 886 ------------------ .../grid/gridwise_2d_reduction_multiblock.hpp | 638 +++++++++++++ ...ise_2d_reduction_multiblock_atomic_add.hpp | 269 ------ ...2d_reduction_multiblock_partial_reduce.hpp | 487 ---------- .../grid/gridwise_2d_reduction_threadwise.hpp | 371 ++++---- include/ck/utility/dynamic_buffer.hpp | 2 +- .../utility/generic_memory_space_atomic.hpp | 23 + include/ck/utility/reduction_operator.hpp | 58 +- .../library/host_tensor/host_common_util.hpp | 102 ++ .../library/host_tensor/host_reduce_util.hpp | 26 +- .../ck/library/host_tensor/host_reduction.hpp | 18 +- .../gpu/reduce/device_reduce_instance.hpp | 17 +- .../device_reduce_instance_blockwise.hpp | 148 +-- ..._reduce_instance_blockwise_b16_f32_b16.hpp | 3 +- ..._reduce_instance_blockwise_f16_f16_f16.hpp | 3 +- ..._reduce_instance_blockwise_f16_f32_f16.hpp | 3 +- ..._reduce_instance_blockwise_f32_f32_f32.hpp | 2 - ..._reduce_instance_blockwise_f32_f64_f32.hpp | 2 - ..._reduce_instance_blockwise_f64_f64_f64.hpp | 2 - ...ce_reduce_instance_blockwise_i8_i32_i8.hpp | 2 - ...ice_reduce_instance_blockwise_i8_i8_i8.hpp | 2 - ..._reduce_instance_blockwise_second_call.hpp | 165 ----
...ance_blockwise_second_call_f16_f16_f16.hpp | 47 - ...ance_blockwise_second_call_f32_f32_b16.hpp | 60 -- ...ance_blockwise_second_call_f32_f32_f16.hpp | 35 - ...ance_blockwise_second_call_f32_f32_f32.hpp | 59 -- ...ance_blockwise_second_call_f64_f64_f32.hpp | 35 - ...ance_blockwise_second_call_f64_f64_f64.hpp | 59 -- ...tance_blockwise_second_call_i32_i32_i8.hpp | 31 - ...nstance_blockwise_second_call_i8_i8_i8.hpp | 47 - .../device_reduce_instance_impl_common.hpp | 14 - ..._reduce_instance_multiblock_atomic_add.hpp | 123 +-- ...ance_multiblock_atomic_add_b16_f32_f32.hpp | 3 +- ...ance_multiblock_atomic_add_f16_f32_f32.hpp | 3 +- ...ance_multiblock_atomic_add_f32_f32_f32.hpp | 2 - ...ance_multiblock_atomic_add_f32_f64_f32.hpp | 2 - ...ance_multiblock_atomic_add_f64_f64_f64.hpp | 29 + ...uce_instance_multiblock_partial_reduce.hpp | 174 ---- ..._multiblock_partial_reduce_b16_f32_b16.hpp | 60 -- ..._multiblock_partial_reduce_f16_f16_f16.hpp | 47 - ..._multiblock_partial_reduce_f16_f32_f16.hpp | 35 - ..._multiblock_partial_reduce_f32_f32_f32.hpp | 52 - ..._multiblock_partial_reduce_f32_f64_f32.hpp | 27 - ..._multiblock_partial_reduce_f64_f64_f64.hpp | 62 -- ...ce_multiblock_partial_reduce_i8_i32_i8.hpp | 31 - ...nce_multiblock_partial_reduce_i8_i8_i8.hpp | 47 - .../device_reduce_instance_threadwise.hpp | 75 +- ...reduce_instance_threadwise_b16_f32_b16.hpp | 3 +- ...reduce_instance_threadwise_f16_f16_f16.hpp | 3 +- ...reduce_instance_threadwise_f16_f32_f16.hpp | 3 +- ...reduce_instance_threadwise_f32_f32_f32.hpp | 2 - ...reduce_instance_threadwise_f32_f64_f32.hpp | 2 - ...reduce_instance_threadwise_f64_f64_f64.hpp | 2 - ...e_reduce_instance_threadwise_i8_i32_i8.hpp | 2 - ...ce_reduce_instance_threadwise_i8_i8_i8.hpp | 2 - .../gpu/reduce/CMakeLists.txt | 17 +- ...ance_blockwise_second_call_f16_f16_f16.cpp | 40 - ...ance_blockwise_second_call_f32_f32_b16.cpp | 53 -- ...ance_blockwise_second_call_f32_f32_f16.cpp | 28 - ...ance_blockwise_second_call_f32_f32_f32.cpp | 52 - ...ance_blockwise_second_call_f64_f64_f32.cpp | 28 - ...ance_blockwise_second_call_f64_f64_f64.cpp | 52 - ...tance_blockwise_second_call_i32_i32_i8.cpp | 24 - ...nstance_blockwise_second_call_i8_i8_i8.cpp | 40 - ...ance_multiblock_atomic_add_f64_f64_f64.cpp | 24 + ..._multiblock_partial_reduce_b16_f32_b16.cpp | 53 -- ..._multiblock_partial_reduce_f16_f16_f16.cpp | 40 - ..._multiblock_partial_reduce_f16_f32_f16.cpp | 28 - ..._multiblock_partial_reduce_f32_f32_f32.cpp | 45 - ..._multiblock_partial_reduce_f32_f64_f32.cpp | 20 - ..._multiblock_partial_reduce_f64_f64_f64.cpp | 55 -- ...ce_multiblock_partial_reduce_i8_i32_i8.cpp | 24 - ...nce_multiblock_partial_reduce_i8_i8_i8.cpp | 40 - profiler/include/profile_reduce_impl.hpp | 422 +++------ profiler/src/profile_reduce.cpp | 218 ++--- script/test_reduce_no_index.sh | 11 + script/test_reduce_with_index.sh | 11 + test/reduce/reduce_no_index.cpp | 561 ++--------- test/reduce/reduce_util.hpp | 19 - test/reduce/reduce_with_index.cpp | 566 ++--------- 94 files changed, 2429 insertions(+), 6785 deletions(-) create mode 100644 example/12_reduce/reduce_blockwise_two_call.cpp delete mode 100644 include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp delete mode 100644 include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp rename include/ck/tensor_operation/gpu/device/{device_reduce_multiblock_atomic_add.hpp => device_reduce_multiblock.hpp} (58%) delete mode 100644 include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp delete mode 
100644 include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_blockwise.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp delete mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_atomic_add.hpp delete mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_partial_reduce.hpp create mode 100644 library/include/ck/library/host_tensor/host_common_util.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_b16.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f16.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f32.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f32.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i32_i32_i8.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i8_i8_i8.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.hpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_b16.cpp delete mode 100644 
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f16.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f32.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f32.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i32_i32_i8.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i8_i8_i8.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.cpp delete mode 100644 test/reduce/reduce_util.hpp diff --git a/example/12_reduce/CMakeLists.txt b/example/12_reduce/CMakeLists.txt index d6866abeb8..9045a78a85 100644 --- a/example/12_reduce/CMakeLists.txt +++ b/example/12_reduce/CMakeLists.txt @@ -1 +1,2 @@ -add_example_executable(example_reduce_blockwise reduce_blockwise.cpp -D 16,64,32,960 -v 1 1 10) +add_example_executable(example_reduce_blockwise reduce_blockwise.cpp) +add_example_executable(example_reduce_blockwise_two_call reduce_blockwise_two_call.cpp) diff --git a/example/12_reduce/README.md b/example/12_reduce/README.md index 6fd3b3dcf3..a6442984e7 100644 --- a/example/12_reduce/README.md +++ b/example/12_reduce/README.md @@ -5,23 +5,38 @@ # -D : input 4-d tensor lengths # -v : verification (0=no, 1=yes) #arg1: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value) -#arg2: run kernel # of times (>1) -./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 10 +#arg2: time kernel (0=no, 1=yes) +./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 1 ``` Result ``` +./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 1 launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1} -Warm up -Start running 3 times... -Perf: 0.23536 ms, 267.32 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1> -error: 0 -max_diff: 0, 529, 529 -root@dc-smc-18:/data/composable_kernel/Build3# bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 10 -launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1} -Warm up +Warm up 1 time Start running 10 times... 
-Perf: 0.23392 ms, 268.966 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1> -error: 0 -max_diff: 0, 528, 528 +Perf: 0.282592 ms, 222.641 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1> ``` + +# Instructions for ```example_reduce_blockwise_two_call``` + +## Run ```example_reduce_blockwise_two_call``` +```bash +#arg1: verification (0=no, 1=yes) +#arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value) +#arg3: time kernel (0=no, 1=yes) +./bin/example_reduce_blockwise_two_call 1 2 1 +``` + +Result +``` +./bin/example_reduce_blockwise_two_call 1 2 1 +launch_and_time_kernel: grid_dim {204800, 1, 1}, block_dim {256, 1, 1} +Warm up 1 time +Start running 10 times... +launch_and_time_kernel: grid_dim {6400, 1, 1}, block_dim {256, 1, 1} +Warm up 1 time +Start running 10 times... +Perf: 2.1791 ms, 771.42 GB/s, DeviceReduceBlockWise<256,M_C32_S1,K_C8_S1,InSrcVectorDim_1_InSrcVectorSize_1_OutDstVectorSize_1> => DeviceReduceBlockWise<256,M_C256_S1,K_C1_S1,InSrcVectorDim_1_InSrcVectorSize_1_OutDstVectorSize_1> +``` + diff --git a/example/12_reduce/reduce_blockwise.cpp b/example/12_reduce/reduce_blockwise.cpp index b2d312ae8c..e1e3afc58a 100644 --- a/example/12_reduce/reduce_blockwise.cpp +++ b/example/12_reduce/reduce_blockwise.cpp @@ -12,8 +12,8 @@ #include "host_tensor_generator.hpp" #include "device_tensor.hpp" #include "device_base.hpp" -#include "device_reduce_blockwise.hpp" -#include "host_reduce_util.hpp" +#include "device_reduce_multiblock.hpp" +#include "host_common_util.hpp" #include "host_reduction.hpp" #include "reduction_enums.hpp" @@ -30,9 +30,8 @@ constexpr int Rank = 4; constexpr int NumReduceDim = 3; constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::NORM2; -constexpr NanPropagation NanOpt = NanPropagation::PROPAGATE_NAN; -constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ?
false : true; -constexpr ReduceTensorIndices IndicesOpt = ReduceTensorIndices::NO_INDICES; +constexpr bool PropagateNan = true; +constexpr bool OutputIndex = false; using ReduceOperation = typename reduce_binary_operator::opType; using InElementwiseOperation = @@ -40,85 +39,44 @@ using InElementwiseOperation = using AccElementwiseOperation = typename reduce_unary_operator::AccElementwiseOperation; -using DeviceReduceInstance = DeviceReduceBlockWise; +using DeviceReduceInstance = DeviceReduceMultiBlock; static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'}, - {"scales", required_argument, nullptr, 'S'}, {"verify", required_argument, nullptr, 'v'}, {"help", no_argument, nullptr, '?'}, {nullptr, 0, nullptr, 0}}; class SimpleAppArgs { - template - static T getSingleValueFromString(const std::string& valueStr) - { - std::istringstream iss(valueStr); - - T ret; - - iss >> ret; - - return (ret); - }; - - template - static std::vector getTypeValuesFromString(const char* cstr_values) - { - std::string valuesStr(cstr_values); - - std::vector values; - std::size_t pos = 0; - std::size_t new_pos; - - new_pos = valuesStr.find(',', pos); - while(new_pos != std::string::npos) - { - const std::string sliceStr = valuesStr.substr(pos, new_pos - pos); - - T val = getSingleValueFromString(sliceStr); - - values.push_back(val); - - pos = new_pos + 1; - new_pos = valuesStr.find(',', pos); - }; - - std::string sliceStr = valuesStr.substr(pos); - T val = getSingleValueFromString(sliceStr); - - values.push_back(val); - - return (values); - }; - private: int option_index = 0; public: - std::vector inLengths; - std::vector scales; + std::vector inLengths = {16, 64, 32, 960}; + std::vector scales = {1.0f, 0.0f}; bool do_verification = true; int init_method = 1; - bool time_kernel = false; + bool time_kernel = true; public: void show_usage(const char* cmd) @@ -126,24 +84,24 @@ class SimpleAppArgs std::cout << "Usage of " << cmd << std::endl; std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths" << std::endl; - std::cout << "--scales or -S, comma separated two float values for alpha and beta" - << std::endl; std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by " "comparing with the host-based reduction" << std::endl; std::cout << "Arg1 -- init method (0=no init, 1=single integer value, 2=scope integer " "value, 3=decimal value)" << std::endl; - std::cout << "Arg2 -- time kernel (0=n0, 1=yes)" << std::endl; + std::cout << "Arg2 -- time kernel (0=no, 1=yes)" << std::endl; }; int processArgs(int argc, char* argv[]) { + using ck::host_common::getTypeValuesFromString; + int ch; while(1) { - ch = getopt_long(argc, argv, "D:S:v:l:", long_options, &option_index); + ch = getopt_long(argc, argv, "D:v:l:", long_options, &option_index); if(ch == -1) break; switch(ch) @@ -154,12 +112,6 @@ class SimpleAppArgs inLengths = getTypeValuesFromString(optarg); break; - case 'S': - if(!optarg) - throw std::runtime_error("Invalid option format!"); - - scales = getTypeValuesFromString(optarg); - break; case 'v': if(!optarg) throw std::runtime_error("Invalid option format!"); @@ -181,7 +133,7 @@ class SimpleAppArgs throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!"); init_method = std::atoi(argv[optind++]); - time_kernel = std::atoi(argv[optind]); + time_kernel = static_cast(std::atoi(argv[optind])); if(scales.empty()) { @@ -202,16 +154,16 @@ int main(int argc, char* argv[]) SimpleAppArgs args; - 
if(args.processArgs(argc, argv) < 0) - return (-1); + if(argc > 1) + { + if(args.processArgs(argc, argv) < 0) + return (-1); + }; constexpr bool op_support_indices = (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX); - constexpr bool NeedIndices = - (op_support_indices && (IndicesOpt != ReduceTensorIndices::NO_INDICES)); - // if input is half type, no reason to use float for indiced reduction operation and must use // float for non-indiced reduction operation for accuracy constexpr bool invalid_reduce_1 = @@ -225,8 +177,7 @@ int main(int argc, char* argv[]) (op_support_indices && !std::is_same::value); // indices option can only be used when it is really needed - constexpr bool invalid_reduce_3 = - (!op_support_indices && IndicesOpt != ReduceTensorIndices::NO_INDICES); + constexpr bool invalid_reduce_3 = (!op_support_indices && OutputIndex); constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3); @@ -294,9 +245,9 @@ int main(int argc, char* argv[]) if(beta != 0.0f) out_dev.ToDevice(out.mData.data()); - size_t indicesSizeInBytes = NeedIndices ? out.mDesc.GetElementSize() * sizeof(int32_t) : 0; + size_t indicesSizeInBytes = OutputIndex ? out.mDesc.GetElementSize() * sizeof(int32_t) : 0; - DeviceMem out_indices_dev(indicesSizeInBytes); + DeviceMem out_index_dev(indicesSizeInBytes); if(args.do_verification) { @@ -307,38 +258,39 @@ int main(int argc, char* argv[]) Rank, NumReduceDim, PropagateNan, - NeedIndices> + OutputIndex> hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims); hostReduce.Run( alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data()); }; - const auto i_inLengths = to_int_vector(args.inLengths); - const auto i_inStrides = to_int_vector(inStrides); - const auto i_outLengths = to_int_vector(outLengths); - const auto i_outStrides = to_int_vector(outStrides); + std::vector i_inLengths; + std::vector i_inStrides; + std::vector i_outLengths; + std::vector i_outStrides; + + i_inLengths.assign(args.inLengths.begin(), args.inLengths.end()); + i_inStrides.assign(inStrides.begin(), inStrides.end()); + i_outLengths.assign(outLengths.begin(), outLengths.end()); + i_outStrides.assign(outStrides.begin(), outStrides.end()); auto reduce = DeviceReduceInstance{}; - auto wsSizeInBytes = reduce.GetWorkspaceSizeInBytes(i_inLengths, reduceDims); - - DeviceMem ws_dev(wsSizeInBytes); - - auto argument_ptr = - reduce.MakeArgumentPointer(i_inLengths, - i_inStrides, - i_outLengths, - i_outStrides, - reduceDims, - alpha, - beta, - in_dev.GetDeviceBuffer(), - out_dev.GetDeviceBuffer(), - out_indices_dev.GetDeviceBuffer(), - ws_dev.GetDeviceBuffer(), - InElementwiseOperation{static_cast(reduce_total_length)}, - AccElementwiseOperation{static_cast(reduce_total_length)}); + auto argument_ptr = reduce.MakeArgumentPointer( + i_inLengths, + i_inStrides, + i_outLengths, + i_outStrides, + reduceDims, + alpha, + beta, + in_dev.GetDeviceBuffer(), + nullptr, + out_dev.GetDeviceBuffer(), + out_index_dev.GetDeviceBuffer(), + InElementwiseOperation{static_cast(reduce_total_length)}, + AccElementwiseOperation{static_cast(reduce_total_length)}); if(!reduce.IsSupportedArgument(argument_ptr.get())) { @@ -362,16 +314,18 @@ int main(int argc, char* argv[]) << std::endl; bool pass = true; + if(args.do_verification) { out_dev.FromDevice(out.mData.data()); - pass &= ck::utils::check_err(out.mData, out_ref.mData); + pass = pass && ck::utils::check_err(out.mData, out_ref.mData); - if(NeedIndices) + 
if(OutputIndex) { - out_indices_dev.FromDevice(out_indices.mData.data()); - pass &= ck::utils::check_err(out_indices.mData, out_indices_ref.mData); + out_index_dev.FromDevice(out_indices.mData.data()); + pass = pass && ck::utils::check_err(out_indices.mData, out_indices_ref.mData); }; }; - return pass ? 0 : 1; + + return (pass ? 0 : 1); } diff --git a/example/12_reduce/reduce_blockwise_two_call.cpp b/example/12_reduce/reduce_blockwise_two_call.cpp new file mode 100644 index 0000000000..cd166c40fe --- /dev/null +++ b/example/12_reduce/reduce_blockwise_two_call.cpp @@ -0,0 +1,290 @@ +#include +#include +#include +#include +#include +#include + +#include "check_err.hpp" +#include "config.hpp" +#include "print.hpp" +#include "device.hpp" +#include "host_tensor.hpp" +#include "host_tensor_generator.hpp" +#include "device_tensor.hpp" +#include "device_base.hpp" +#include "device_reduce_multiblock.hpp" +#include "host_common_util.hpp" +#include "host_reduction.hpp" + +#include "reduction_enums.hpp" +#include "reduction_operator_mapping.hpp" + +using namespace ck; +using namespace ck::tensor_operation::device; + +using InOutDataType = ck::half_t; +using AccDataType = float; + +constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::NORM2; +constexpr bool PropagateNan = true; +constexpr bool OutputIndex = false; + +using ReduceOperation = typename reduce_binary_operator::opType; +using InElementwiseOperation = + typename reduce_unary_operator::InElementwiseOperation; +using AccElementwiseOperation = + typename reduce_unary_operator::AccElementwiseOperation; + +using PassThroughOp = tensor_operation::element_wise::UnaryIdentic; + +using DeviceReduceInstance_1 = DeviceReduceMultiBlock; + +using DeviceReduceInstance_2 = DeviceReduceMultiBlock; + +static bool do_verify; +static int init_method; +static float alpha; +static float beta; +static bool time_kernel; + +int main(int argc, char* argv[]) +{ + // used by the device reduction + const std::vector reduceDims_1 = {4}; + const std::vector invariantDims_1 = {0, 1, 2, 3}; + + const std::vector reduceDims_2 = {3}; + const std::vector invariantDims_2 = {0, 1, 2}; + + // used by the host reduction + const std::vector reduceDims = {3, 4}; + const std::vector invariantDims = {0, 1, 2}; + + const std::vector inLengths_1 = {64, 320, 80, 4, 128}; + + // input lengths of the second reduction, which are also the output lengths of the first + // reduction + const std::vector inLengths_2 = {64, 320, 80, 4}; + + const std::vector outLengths = {64, 320, 80}; + + using namespace ck::host_reduce; + + if(argc == 1) + { + do_verify = true; + init_method = 2; + time_kernel = true; + } + else if(argc == 4) + { + do_verify = static_cast<bool>(atoi(argv[1])); + init_method = atoi(argv[2]); + time_kernel = static_cast<bool>(atoi(argv[3])); + } + else + { + std::ostringstream ostr; + + ostr << "Wrong parameter! 
" << std::endl + << "Usage: " << argv[0] << "[verify 0/1] init_method time_kernel" << std::endl; + + throw std::runtime_error(ostr.str()); + }; + + alpha = 1.0f; + beta = 0.0f; + + Tensor in_1(inLengths_1); + + Tensor out_ref(outLengths); + Tensor in_2(inLengths_2); // also the output tensor of the first reduction + Tensor out(outLengths); + + auto inStrides_1 = in_1.mDesc.GetStrides(); + auto inStrides_2 = in_2.mDesc.GetStrides(); + auto outStrides = out.mDesc.GetStrides(); + + size_t invariant_total_length = out.mDesc.GetElementSize(); + size_t reduce_total_length = in_1.mDesc.GetElementSize() / invariant_total_length; + + std::size_t num_thread = 1; + + if(do_verify) + { + switch(init_method) + { + case 0: break; + case 1: + in_1.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + break; + case 2: + in_1.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + default: + in_1.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, + num_thread); + } + + if(beta != 0.0f) + for(size_t i = 0; i < out_ref.mDesc.GetElementSpace(); i++) + out.mData[i] = out_ref.mData[i]; + }; + + DeviceMem in_1_dev(sizeof(InOutDataType) * in_1.mDesc.GetElementSpace()); + DeviceMem in_2_dev(sizeof(InOutDataType) * in_2.mDesc.GetElementSpace()); + DeviceMem out_dev(sizeof(InOutDataType) * out.mDesc.GetElementSpace()); + + in_1_dev.ToDevice(in_1.mData.data()); + + if(beta != 0.0f) + out_dev.ToDevice(out.mData.data()); + + if(do_verify) + { + ReductionHost + hostReduce(in_1.mDesc, out_ref.mDesc, invariantDims, reduceDims); + + hostReduce.Run(alpha, in_1.mData.data(), beta, out_ref.mData.data(), nullptr); + }; + + std::vector i_inLengths_1; + std::vector i_inStrides_1; + std::vector i_inLengths_2; + std::vector i_inStrides_2; + std::vector i_outLengths; + std::vector i_outStrides; + + i_inLengths_1.assign(inLengths_1.begin(), inLengths_1.end()); + i_inStrides_1.assign(inStrides_1.begin(), inStrides_1.end()); + i_inLengths_2.assign(inLengths_2.begin(), inLengths_2.end()); + i_inStrides_2.assign(inStrides_2.begin(), inStrides_2.end()); + i_outLengths.assign(outLengths.begin(), outLengths.end()); + i_outStrides.assign(outStrides.begin(), outStrides.end()); + + auto reduce_1 = DeviceReduceInstance_1{}; + + auto argument_ptr_1 = reduce_1.MakeArgumentPointer( + i_inLengths_1, + i_inStrides_1, + i_inLengths_2, + i_inStrides_2, + reduceDims_1, + 1.0f, + 0.0f, + in_1_dev.GetDeviceBuffer(), + nullptr, + in_2_dev.GetDeviceBuffer(), + nullptr, + InElementwiseOperation{static_cast(reduce_total_length)}, + PassThroughOp{}); + + if(!reduce_1.IsSupportedArgument(argument_ptr_1.get())) + { + std::cout + << "The runtime parameters seems not supported by the DeviceReduce instance, exiting!" 
+ << std::endl; + }; + + auto invoker_ptr_1 = reduce_1.MakeInvokerPointer(); + + auto reduce_2 = DeviceReduceInstance_2{}; + + auto argument_ptr_2 = reduce_2.MakeArgumentPointer( + i_inLengths_2, + i_inStrides_2, + i_outLengths, + i_outStrides, + reduceDims_2, + alpha, + beta, + in_2_dev.GetDeviceBuffer(), + nullptr, + out_dev.GetDeviceBuffer(), + nullptr, + PassThroughOp{}, + AccElementwiseOperation{static_cast(reduce_total_length)}); + + if(!reduce_2.IsSupportedArgument(argument_ptr_2.get())) + { + std::cout + << "The runtime parameters seem not to be supported by the DeviceReduce instance, exiting!" + << std::endl; + }; + + auto invoker_ptr_2 = reduce_2.MakeInvokerPointer(); + + float avg_time_1 = invoker_ptr_1->Run(argument_ptr_1.get(), StreamConfig{nullptr, time_kernel}); + float avg_time_2 = invoker_ptr_2->Run(argument_ptr_2.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InOutDataType) + + invariant_total_length * sizeof(InOutDataType); + + float gb_per_sec = num_bytes / 1.E6 / (avg_time_1 + avg_time_2); + + std::cout << "Perf: " << avg_time_1 + avg_time_2 << " ms, " << gb_per_sec << " GB/s, " + << reduce_1.GetTypeString() << " => " << reduce_2.GetTypeString() << std::endl; + + bool pass = true; + + if(do_verify) + { + out_dev.FromDevice(out.mData.data()); + pass = pass && ck::utils::check_err(out.mData, out_ref.mData); + }; + + return (pass ? 0 : 1); +} diff --git a/example/13_pool2d_fwd/README.md b/example/13_pool2d_fwd/README.md index d9c829fb98..2314cfd670 100644 --- a/example/13_pool2d_fwd/README.md +++ b/example/13_pool2d_fwd/README.md @@ -4,9 +4,9 @@ ```bash #arg1: verification (0=no, 1=yes) #arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value) -#arg3: run kernel # of times (>1) +#arg3: time kernel (0=no, 1=yes) #arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, RightPx -./bin/example_pool2d_fwd 1 1 10 +./bin/example_pool2d_fwd 1 1 1 ``` Result @@ -14,9 +14,7 @@ Result in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192} out_n_c_ho_wo: dim 4, lengths {128, 192, 36, 36}, strides {248832, 1, 6912, 192} launch_and_time_kernel: grid_dim {124416, 1, 1}, block_dim {64, 1, 1} -Warm up +Warm up 1 time Start running 10 times... 
-Perf: 0.415453 ms, 1.37996 TFlops, 749.726 GB/s -error: 0 -max_diff: 0, 1, 1 +Perf: 0.397436 ms, 1.44252 TFlops, 783.713 GB/s ``` diff --git a/example/13_pool2d_fwd/pool2d_fwd.cpp b/example/13_pool2d_fwd/pool2d_fwd.cpp index e6749bf8d7..662a48500f 100644 --- a/example/13_pool2d_fwd/pool2d_fwd.cpp +++ b/example/13_pool2d_fwd/pool2d_fwd.cpp @@ -20,6 +20,8 @@ using InDataType = ck::half_t; using OutDataType = ck::half_t; using AccDataType = float; +using IndexDataType = int32_t; + using InLayout = ck::tensor_layout::convolution::NHWC; using OutLayout = ck::tensor_layout::convolution::NHWC; @@ -29,7 +31,7 @@ static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX; static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG; #endif -static constexpr bool NeedIndices = false; +static constexpr bool OutputIndex = false; static constexpr bool PropagateNan = false; using DevicePoolFwdInstance = @@ -38,7 +40,7 @@ using DevicePoolFwdInstance = OutDataType, // OutDataType AccDataType, // AccDataType ReduceOpId, - NeedIndices, + OutputIndex, 64, // BlockSize 64, // ReduceMThreadClusterSize 1, // ReduceKThreadClusterSize @@ -51,10 +53,10 @@ template + bool OutputIndex> static void pool_host_verify(const Tensor& in, Tensor& out, - Tensor& out_indices, + Tensor& out_indices, const std::array& window_spatial_lengths, const std::array& window_strides, const std::array& in_left_pads, @@ -62,26 +64,26 @@ static void pool_host_verify(const Tensor& in, { using namespace ck::host_reduce; - const int divider = window_spatial_lengths[0] * window_spatial_lengths[1]; + const int32_t divider = window_spatial_lengths[0] * window_spatial_lengths[1]; const auto PreUnaryOp = PreUnaryOpFn(divider); const auto PosUnaryOp = PosUnaryOpFn(divider); - if constexpr(!NeedIndices) + if constexpr(!OutputIndex) { auto opReduce = ReduceOpFn(); auto f_nchw = [&](auto n, auto c, auto ho, auto wo) { auto accuVal = ReduceOpZeroVal(); - for(int y = 0; y < window_spatial_lengths[0]; ++y) + for(ck::index_t y = 0; y < window_spatial_lengths[0]; ++y) { - int hi = ho * window_strides[0] + y - in_left_pads[0]; - for(int x = 0; x < window_spatial_lengths[1]; ++x) + ck::index_t hi = ho * window_strides[0] + y - in_left_pads[0]; + for(ck::index_t x = 0; x < window_spatial_lengths[1]; ++x) { - int wi = wo * window_strides[1] + x - in_left_pads[1]; - if(hi >= 0 && hi < ck::type_convert(in.mDesc.GetLengths()[2]) && wi >= 0 && - wi < ck::type_convert(in.mDesc.GetLengths()[3])) + ck::index_t wi = wo * window_strides[1] + x - in_left_pads[1]; + if(hi >= 0 && hi < static_cast(in.mDesc.GetLengths()[2]) && + wi >= 0 && wi < static_cast(in.mDesc.GetLengths()[3])) { AccDataType currVal = static_cast(in(n, c, hi, wi)); @@ -108,24 +110,24 @@ static void pool_host_verify(const Tensor& in, auto opReduce = ReduceOpFn2(); auto f_nchw = [&](auto n, auto c, auto ho, auto wo) { - auto accuVal = ReduceOpZeroVal(); - int accuIndex = 0; + auto accuVal = ReduceOpZeroVal(); + IndexDataType accuIndex = 0; - for(int y = 0; y < window_spatial_lengths[0]; ++y) + for(ck::index_t y = 0; y < window_spatial_lengths[0]; ++y) { - int hi = ho * window_strides[0] + y - in_left_pads[0]; - for(int x = 0; x < window_spatial_lengths[1]; ++x) + ck::index_t hi = ho * window_strides[0] + y - in_left_pads[0]; + for(ck::index_t x = 0; x < window_spatial_lengths[1]; ++x) { - int wi = wo * window_strides[1] + x - in_left_pads[1]; + ck::index_t wi = wo * window_strides[1] + x - in_left_pads[1]; if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 && wi < in.mDesc.GetLengths()[3]) { - 
AccDataType currVal = static_cast(in(n, c, hi, wi)); - int currIndex = y * window_spatial_lengths[1] + x; + AccDataType currVal = static_cast(in(n, c, hi, wi)); + IndexDataType currIndex = y * window_spatial_lengths[1] + x; PreUnaryOp(currVal); - binop_with_nan_check2( + binop_with_index_and_nan_check( opReduce, accuVal, currVal, accuIndex, currIndex); } } @@ -149,9 +151,9 @@ int main(int argc, char* argv[]) { using namespace ck::host_reduce; - bool do_verification = true; - int init_method = 1; - bool time_kernel = false; + bool do_verification; + int init_method; + bool time_kernel; // Pool shape ck::index_t N = 128; @@ -167,17 +169,23 @@ int main(int argc, char* argv[]) ck::index_t in_right_pad_h = 1; ck::index_t in_right_pad_w = 1; - if(argc == 4) + if(argc == 1) + { + do_verification = true; + init_method = 1; + time_kernel = true; + } + else if(argc == 4) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - time_kernel = std::stoi(argv[3]); + time_kernel = static_cast(std::stoi(argv[3])); } else if(argc == 16) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - time_kernel = std::stoi(argv[3]); + time_kernel = static_cast(std::stoi(argv[3])); N = std::stoi(argv[4]); C = std::stoi(argv[5]); @@ -196,7 +204,7 @@ int main(int argc, char* argv[]) { printf("arg1: verification (0=no, 1=yes)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf("arg3: time kernel (0=n0, 1=yes)\n"); + printf("arg3: time kernel (0=no, 1=yes)\n"); printf("arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, " "RightPx\n"); exit(0); @@ -228,9 +236,11 @@ int main(int argc, char* argv[]) Tensor in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{})); Tensor out_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{})); - Tensor out_indices_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{})); + Tensor out_indices_n_c_ho_wo_host( + f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{})); Tensor out_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{})); - Tensor out_indices_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{})); + Tensor out_indices_n_c_ho_wo_device( + f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{})); std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl; std::cout << "out_n_c_ho_wo: " << out_n_c_ho_wo_host.mDesc << std::endl; @@ -245,25 +255,25 @@ int main(int argc, char* argv[]) DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace()); DeviceMem out_device_buf(sizeof(OutDataType) * out_n_c_ho_wo_device.mDesc.GetElementSpace()); - DeviceMem out_indices_device_buf(sizeof(int) * + DeviceMem out_indices_device_buf(sizeof(IndexDataType) * out_indices_n_c_ho_wo_device.mDesc.GetElementSpace()); in_device_buf.ToDevice(in_n_c_hi_wi.mData.data()); - auto pool = DevicePoolFwdInstance{}; - auto invoker_ptr = pool.MakeInvokerPointer(); - auto argument_ptr = - pool.MakeArgumentPointer(static_cast(in_device_buf.GetDeviceBuffer()), - static_cast(out_device_buf.GetDeviceBuffer()), - static_cast(out_indices_device_buf.GetDeviceBuffer()), - N, - C, - std::array{{Hi, Wi}}, - std::array{{Y, X}}, - std::array{{Ho, Wo}}, - window_strides, - input_left_pads, - input_right_pads); + auto pool = DevicePoolFwdInstance{}; + auto invoker_ptr = pool.MakeInvokerPointer(); + auto argument_ptr = pool.MakeArgumentPointer( + static_cast(in_device_buf.GetDeviceBuffer()), + static_cast(out_device_buf.GetDeviceBuffer()), + 
static_cast(out_indices_device_buf.GetDeviceBuffer()), + N, + C, + std::array{{Hi, Wi}}, + std::array{{Y, X}}, + std::array{{Ho, Wo}}, + window_strides, + input_left_pads, + input_right_pads); if(!pool.IsSupportedArgument(argument_ptr.get())) { @@ -286,6 +296,7 @@ int main(int argc, char* argv[]) << std::endl; bool pass = true; + if(do_verification) { pool_host_verify(in_n_c_hi_wi, + OutputIndex>(in_n_c_hi_wi, out_n_c_ho_wo_host, out_indices_n_c_ho_wo_host, window_spatial_lengths, @@ -303,15 +314,16 @@ int main(int argc, char* argv[]) out_device_buf.FromDevice(out_n_c_ho_wo_device.mData.data()); - pass &= ck::utils::check_err(out_n_c_ho_wo_device.mData, out_n_c_ho_wo_host.mData); + pass = pass && ck::utils::check_err(out_n_c_ho_wo_device.mData, out_n_c_ho_wo_host.mData); - if constexpr(NeedIndices) + if constexpr(OutputIndex) { out_indices_device_buf.FromDevice(out_indices_n_c_ho_wo_device.mData.data()); - pass &= ck::utils::check_err(out_indices_n_c_ho_wo_device.mData, - out_indices_n_c_ho_wo_host.mData); + pass = pass && ck::utils::check_err(out_indices_n_c_ho_wo_device.mData, + out_indices_n_c_ho_wo_host.mData); }; } - return pass ? 0 : 1; + + return (pass ? 0 : 1); } diff --git a/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp b/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp index f665378e08..c7e18d98dc 100644 --- a/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp +++ b/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp @@ -17,7 +17,7 @@ template :: AccElementwiseOperation; - static constexpr bool BetaIsZero = true; - static constexpr index_t InSrcOutDstVectorDim = 0; // for NHWC, the dim C is the vector Dim for both input and output in memory, which is // not reduced. 
@@ -206,28 +204,28 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd { float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { - using gridwise_reduce = GridwiseReduction_mk_to_m_threadwise; + using gridwise_reduce = + GridwiseReduction_mk_to_m_threadwise; const auto kernel = kernel_reduce_threadwise struct DeviceReduce : public BaseOperator { - virtual long_index_t GetWorkspaceSizeInBytes(const std::vector inLengths, - const std::vector reduceDims) - { - (void)inLengths; - (void)reduceDims; - - return (0); - }; - - virtual bool HasFurtherCall() { return (false); }; - - virtual std::vector GetWorkspace2dLengths(const BaseArgument* argPtr) - { - (void)argPtr; - return (std::vector{0, 0}); - }; - virtual std::unique_ptr - MakeArgumentPointer(const std::vector inLengths, - const std::vector inStrides, - const std::vector outLengths, - const std::vector outStrides, + MakeArgumentPointer(const std::vector inLengths, + const std::vector inStrides, + const std::vector outLengths, + const std::vector outStrides, const std::vector reduceDims, float alpha, float beta, const void* in_dev, + const void* in_index_dev, void* out_dev, - void* out_indices_dev, - void* workspace_dev, + void* out_index_dev, const InElementwiseOperation in_elementwise_op, const AccElementwiseOperation acc_elementwise_op) = 0; diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp deleted file mode 100644 index 860f53d8c5..0000000000 --- a/include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp +++ /dev/null @@ -1,374 +0,0 @@ -#ifndef DEVICE_REDUCE_BLOCKWISE_HPP -#define DEVICE_REDUCE_BLOCKWISE_HPP - -#include -#include -#include "device.hpp" -#include "device_reduce.hpp" -#include "device_reduce_common.hpp" -#include "gridwise_2d_reduction_blockwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { - -template -struct DeviceReduceBlockWise : public DeviceReduce -{ - static_assert(Rank <= 6, "Bigger Rank size is not supported!"); - static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize, - "Invalid thread cluster size assignments!"); - - static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) || - (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) && - (MThreadSliceSize % OutDstVectorSize == 0), - "Invalid thread slice sizes and/or vector sizes configuration, please check!"); - - using IndexDataType = int32_t; - - static constexpr bool BetaIsZero = NeedIndices; - - static constexpr index_t NumInvariantDim = Rank - NumReduceDim; - - static constexpr index_t numSrcDim = Rank; - static constexpr index_t numDstDim = (NumInvariantDim == 0) ? 
1 : NumInvariantDim; - static constexpr bool reduceAllDim = (NumInvariantDim == 0); - - static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; - static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; - - static auto MakeSrc2dDescriptor(const std::vector& inLengths, - const std::vector& inStrides) - { - const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number{}); - const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number{}); - - const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); - - const auto in_grid_desc_m_k = [&]() { - if constexpr(reduceAllDim) - { - const auto one_dim_inDesc = transform_tensor_descriptor( - inDesc, - make_tuple(make_merge_transform(tupleSrcLengths)), - make_tuple(typename arithmetic_sequence_gen<0, numSrcDim, 1>::type{}), - make_tuple(Sequence<0>{})); - - return transform_tensor_descriptor(one_dim_inDesc, - make_tuple(make_unmerge_transform(make_tuple( - 1, one_dim_inDesc.GetLength(Number<0>{})))), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0, 1>{})); - } - else - { - using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type; - using ReduceDims = typename arithmetic_sequence_gen::type; - - const auto reduceDimLengths = - make_tuple_from_array_and_index_seq(inLengths, ReduceDims{}); - const auto invariantDimLengths = - make_tuple_from_array_and_index_seq(inLengths, InvariantDims{}); - - return transform_tensor_descriptor( - inDesc, - make_tuple(make_merge_transform(invariantDimLengths), - make_merge_transform(reduceDimLengths)), - make_tuple(InvariantDims{}, ReduceDims{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - } - }(); - - const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{}); - const auto reduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); - - const auto inPad_M = - math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; - const auto inPad_K = - math::integer_least_multiple(reduceLength, K_BlockTileSize) - reduceLength; - - auto in_grid_desc_m_k_padded = transform_tensor_descriptor( - in_grid_desc_m_k, - make_tuple(make_right_pad_transform(invariantLength, inPad_M), - make_right_pad_transform(reduceLength, inPad_K)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - return (in_grid_desc_m_k_padded); - }; - - static auto MakeDst1dDescriptor(const std::vector& outLengths, - const std::vector& outStrides) - { - const auto tupleDstLengths = make_tuple_from_array(outLengths, Number{}); - const auto tupleDstStrides = make_tuple_from_array(outStrides, Number{}); - - auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); - - auto out_grid_desc_m = transform_tensor_descriptor( - outDesc, - make_tuple(make_merge_transform(tupleDstLengths)), - make_tuple(typename arithmetic_sequence_gen<0, numDstDim, 1>::type{}), - make_tuple(Sequence<0>{})); - - const auto invariantLength = out_grid_desc_m.GetLength(Number<0>{}); - - const auto inPad = - math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; - - auto out_grid_desc_m_padded = transform_tensor_descriptor( - out_grid_desc_m, - make_tuple(make_right_pad_transform(invariantLength, inPad)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{})); - return (out_grid_desc_m_padded); - }; - - struct Argument : public BaseArgument - { - Argument(const std::vector inLengths, - const std::vector inStrides, - const std::vector outLengths, - const std::vector 
outStrides, - const std::vector reduceDims, - float alpha, - float beta, - const InDataType* in_dev, - OutDataType* out_dev, - IndexDataType* out_indices_dev, - AccDataType* workspace_dev, - const InElementwiseOperation in_elementwise_op, - const AccElementwiseOperation acc_elementwise_op) - : outLengths_{outLengths}, - outStrides_{outStrides}, - in_dev_{in_dev}, - out_dev_{out_dev}, - out_indices_dev_{out_indices_dev}, - in_elementwise_op_{in_elementwise_op}, - acc_elementwise_op_{acc_elementwise_op} - { - (void)workspace_dev; - - inLengths_ = shuffle_tensor_dimensions(inLengths, reduceDims); - inStrides_ = shuffle_tensor_dimensions(inStrides, reduceDims); - - alpha_ = type_convert(alpha); - beta_ = type_convert(beta); - - std::tie(invariant_total_length, reduce_total_length) = - get_2d_lengths(inLengths_); - - if constexpr(NumInvariantDim == 0) - invariant_lowest_length = 1; - else - invariant_lowest_length = inLengths_[NumInvariantDim - 1]; - - reduce_lowest_length = inLengths_[Rank - 1]; - - gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) / - M_BlockTileSize; - } - - std::vector inLengths_; - std::vector inStrides_; - std::vector outLengths_; - std::vector outStrides_; - - AccDataType alpha_; - AccDataType beta_; - - const InDataType* in_dev_; - OutDataType* out_dev_; - IndexDataType* out_indices_dev_; - - InElementwiseOperation in_elementwise_op_; - AccElementwiseOperation acc_elementwise_op_; - - int invariant_lowest_length; - int reduce_lowest_length; - size_t invariant_total_length; - size_t reduce_total_length; - - size_t gridSize; - }; - - struct Invoker : public BaseInvoker - { - float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) - { - const auto in_grid_desc_m_k = - DeviceReduceBlockWise::MakeSrc2dDescriptor(arg.inLengths_, arg.inStrides_); - const auto out_grid_desc_m = - DeviceReduceBlockWise::MakeDst1dDescriptor(arg.outLengths_, arg.outStrides_); - using InGridDesc_M_K = decltype(in_grid_desc_m_k); - using OutGridDesc_M = decltype(out_grid_desc_m); - - using GridwiseReduce = GridwiseReduction_mk_to_m_blockwise; - - float avg_time = 0; - - const auto kernel = kernel_reduce_blockwise; - - avg_time = launch_and_time_kernel(stream_config, - kernel, - dim3(arg.gridSize), - dim3(BlockSize), - 0, - in_grid_desc_m_k, - out_grid_desc_m, - arg.in_elementwise_op_, - arg.acc_elementwise_op_, - arg.alpha_, - arg.in_dev_, - arg.beta_, - arg.out_dev_, - nullptr, - arg.out_indices_dev_); - - return (avg_time); - }; - - float Run(const BaseArgument* p_arg, - const StreamConfig& stream_config = StreamConfig{}) override - { - return Run(*dynamic_cast(p_arg), stream_config); - }; - }; - - bool IsSupportedArgument(const BaseArgument* p_arg) override - { - const Argument* pArg = dynamic_cast(p_arg); - - if constexpr(InSrcVectorDim == 0) - { - if constexpr(NumInvariantDim == 0) - { - return (false); - } - else - { - if(pArg->inStrides_[NumInvariantDim - 1] != 1) - return (false); - - if(pArg->invariant_lowest_length % InSrcVectorSize != 0) - return (false); - }; - } - else - { - if(pArg->inStrides_[Rank - 1] != 1) - return (false); - - if(pArg->reduce_lowest_length % InSrcVectorSize != 0) - return (false); - }; - - // To improve - if(pArg->invariant_lowest_length % OutDstVectorSize != 0) - return (false); - - // cases with very small reduce_total_length should be handled by the ThreadWise method - if(pArg->reduce_total_length / KThreadSliceSize < 2) - return (false); - - return (true); - }; - - std::unique_ptr - 
MakeArgumentPointer(const std::vector inLengths, - const std::vector inStrides, - const std::vector outLengths, - const std::vector outStrides, - const std::vector reduceDims, - float alpha, - float beta, - const void* in_dev, - void* out_dev, - void* out_indices_dev, - void* workspace_dev, - const InElementwiseOperation in_elementwise_op, - const AccElementwiseOperation acc_elementwise_op) override - { - return std::make_unique(inLengths, - inStrides, - outLengths, - outStrides, - reduceDims, - alpha, - beta, - static_cast(in_dev), - static_cast(out_dev), - static_cast(out_indices_dev), - static_cast(workspace_dev), - in_elementwise_op, - acc_elementwise_op); - }; - - std::unique_ptr MakeInvokerPointer() override - { - return std::make_unique(); - }; - - std::string GetTypeString() const override - { - auto str = std::stringstream(); - - // clang-format off - str << "DeviceReduceBlockWise<" << BlockSize << ","; - str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ","; - str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ","; - str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">"; - // clang-format on - - return str.str(); - } -}; - -} // namespace device -} // namespace tensor_operation -} // namespace ck -#endif diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp deleted file mode 100644 index 43ac48cecc..0000000000 --- a/include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp +++ /dev/null @@ -1,328 +0,0 @@ -#ifndef DEVICE_REDUCE_BLOCKWISE_SECOND_CALL_HPP -#define DEVICE_REDUCE_BLOCKWISE_SECOND_CALL_HPP - -#include -#include -#include "device.hpp" -#include "device_reduce.hpp" -#include "device_reduce_common.hpp" -#include "gridwise_2d_reduction_blockwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { - -template -struct DeviceReduceBlockWiseSecondCall - : public DeviceReduce -{ - static_assert(Rank <= 6, "Bigger Rank size is not supported!"); - static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize, - "Invalid thread cluster size assignments!"); - - static_assert((InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0) && - (MThreadSliceSize % OutDstVectorSize == 0), - "Invalid thread slice sizes and/or vector sizes configuration, please check!"); - - using IndexDataType = int32_t; - - static constexpr bool BetaIsZero = NeedIndices; - - static_assert( - std::is_same::value, - "InDataType and AccDataType should be the same to use DEviceReduceBlockWiseSecondCall!"); - - static constexpr index_t NumInvariantDim = Rank - NumReduceDim; - - static constexpr index_t numDstDim = (NumInvariantDim == 0) ? 
1 : NumInvariantDim; - - static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; - static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; - - static auto MakeSrc2dDescriptor(const std::vector& inLengths, - const std::vector& inStrides) - { - const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<2>{}); - const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<2>{}); - - const auto in_grid_desc_m_k = - make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); - - const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{}); - const auto reduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); - - const auto inPad_M = - math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; - const auto inPad_K = - math::integer_least_multiple(reduceLength, K_BlockTileSize) - reduceLength; - - auto in_grid_desc_m_k_padded = transform_tensor_descriptor( - in_grid_desc_m_k, - make_tuple(make_right_pad_transform(invariantLength, inPad_M), - make_right_pad_transform(reduceLength, inPad_K)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - return (in_grid_desc_m_k_padded); - }; - - static auto MakeDst1dDescriptor(const std::vector& outLengths, - const std::vector& outStrides) - { - const auto tupleDstLengths = make_tuple_from_array(outLengths, Number{}); - const auto tupleDstStrides = make_tuple_from_array(outStrides, Number{}); - - auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); - - auto out_grid_desc_m = transform_tensor_descriptor( - outDesc, - make_tuple(make_merge_transform(tupleDstLengths)), - make_tuple(typename arithmetic_sequence_gen<0, numDstDim, 1>::type{}), - make_tuple(Sequence<0>{})); - - const auto invariantLength = out_grid_desc_m.GetLength(Number<0>{}); - - const auto outPad = - math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; - - auto out_grid_desc_m_padded = transform_tensor_descriptor( - out_grid_desc_m, - make_tuple(make_right_pad_transform(invariantLength, outPad)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{})); - return (out_grid_desc_m_padded); - }; - - struct Argument : public BaseArgument - { - Argument(const std::vector& inLengths, - const std::vector& inStrides, - const std::vector& outLengths, - const std::vector& outStrides, - float alpha, - float beta, - const InDataType* in_dev, - OutDataType* out_dev, - IndexDataType* out_indices_dev, - AccDataType* workspace_dev, - const InElementwiseOperation& in_elementwise_op, - const AccElementwiseOperation& acc_elementwise_op) - : inLengths_(inLengths), - inStrides_(inStrides), - outLengths_(outLengths), - outStrides_(outStrides), - in_dev_{in_dev}, - out_dev_{out_dev}, - out_indices_dev_{out_indices_dev}, - in_elementwise_op_(in_elementwise_op), - acc_elementwise_op_(acc_elementwise_op) - { - alpha_ = type_convert(alpha); - beta_ = type_convert(beta); - - invariant_total_length = inLengths[0]; - reduce_total_length = inLengths[1]; - - invariant_lowest_length = inLengths[0]; - reduce_lowest_length = inLengths[1]; - - gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) / - M_BlockTileSize; - - size_t ws_buf2_bytes_offset = math::integer_least_multiple( - invariant_total_length * reduce_total_length * sizeof(AccDataType), 64); - - if constexpr(NeedIndices) - workspace_indices_dev_ = reinterpret_cast( - reinterpret_cast(workspace_dev) + ws_buf2_bytes_offset); - else - 
workspace_indices_dev_ = nullptr; - } - - std::vector inLengths_; - std::vector inStrides_; - std::vector outLengths_; - std::vector outStrides_; - - AccDataType alpha_; - AccDataType beta_; - - const InDataType* in_dev_; - OutDataType* out_dev_; - IndexDataType* out_indices_dev_; - IndexDataType* workspace_indices_dev_; - - InElementwiseOperation in_elementwise_op_; - AccElementwiseOperation acc_elementwise_op_; - - int invariant_lowest_length; - int reduce_lowest_length; - size_t invariant_total_length; - size_t reduce_total_length; - - size_t gridSize; - }; - - struct Invoker : public BaseInvoker - { - float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) - { - const auto in_grid_desc_m_k = DeviceReduceBlockWiseSecondCall::MakeSrc2dDescriptor( - arg.inLengths_, arg.inStrides_); - const auto out_grid_desc_m = DeviceReduceBlockWiseSecondCall::MakeDst1dDescriptor( - arg.outLengths_, arg.outStrides_); - using InGridDesc_M_K = decltype(in_grid_desc_m_k); - using OutGridDesc_M = decltype(out_grid_desc_m); - - using GridwiseReduce = GridwiseReduction_mk_to_m_blockwise; - - float avg_time = 0; - - const auto kernel = kernel_reduce_blockwise_second_call; - - avg_time = launch_and_time_kernel(stream_config, - kernel, - dim3(arg.gridSize), - dim3(BlockSize), - 0, - in_grid_desc_m_k, - out_grid_desc_m, - arg.in_elementwise_op_, - arg.acc_elementwise_op_, - arg.alpha_, - arg.in_dev_, - arg.beta_, - arg.out_dev_, - arg.workspace_indices_dev_, - arg.out_indices_dev_); - - return (avg_time); - }; - - float Run(const BaseArgument* p_arg, - const StreamConfig& stream_config = StreamConfig{}) override - { - return Run(*dynamic_cast(p_arg), stream_config); - } - }; - - bool IsSupportedArgument(const BaseArgument* p_arg) override - { - const Argument* pArg = dynamic_cast(p_arg); - - if constexpr(InSrcVectorDim == 0) - return (false); - - if(pArg->reduce_lowest_length % InSrcVectorSize != 0) - return (false); - - // To improve - if(pArg->invariant_lowest_length % OutDstVectorSize != 0) - return (false); - - // cases with very small reduce_total_length should be handled by the ThreadWise method - if(pArg->reduce_total_length / KThreadSliceSize < 2) - return (false); - - return (true); - }; - - std::unique_ptr - MakeArgumentPointer(const std::vector inLengths, - const std::vector inStrides, - const std::vector outLengths, - const std::vector outStrides, - const std::vector reduceDims, - float alpha, - float beta, - const void* in_dev, - void* out_dev, - void* out_indices_dev, - void* workspace_dev, - const InElementwiseOperation in_elementwise_op, - const AccElementwiseOperation acc_elementwise_op) override - { - (void)reduceDims; - - return std::make_unique(inLengths, - inStrides, - outLengths, - outStrides, - alpha, - beta, - static_cast(in_dev), - static_cast(out_dev), - static_cast(out_indices_dev), - static_cast(workspace_dev), - in_elementwise_op, - acc_elementwise_op); - }; - - std::unique_ptr MakeInvokerPointer() override - { - return std::make_unique(); - }; - - std::string GetTypeString() const override - { - auto str = std::stringstream(); - - // clang-format off - str << "DeviceReduceBlockWiseSecondCall<" << BlockSize << ","; - str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ","; - str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ","; - str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">"; - // clang-format on - - return str.str(); - } -}; - -} // 
namespace device -} // namespace tensor_operation -} // namespace ck -#endif diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_common.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_common.hpp index 038c754722..f68a392821 100644 --- a/include/ck/tensor_operation/gpu/device/device_reduce_common.hpp +++ b/include/ck/tensor_operation/gpu/device/device_reduce_common.hpp @@ -14,13 +14,13 @@ namespace device { // here, inLengths[] is already shuffled so that lengths of invariant dims are included before those // of reduce dims -template -std::pair get_2d_lengths(const std::vector& inLengths) +template +std::pair get_2d_lengths(const std::vector& inLengths) { static_assert(Rank <= 6, "bigger Rank size not supported!"); - size_t invariant_total_length = 1; - size_t reduce_total_length = 1; + long_index_t invariant_total_length = 1; + long_index_t reduce_total_length = 1; constexpr int NumInvariantDim = Rank - NumReduceDim; @@ -35,13 +35,13 @@ std::pair get_2d_lengths(const std::vector& inLengths) // helper functions using variadic template arguments template -auto make_tuple_from_array_and_index_seq(const std::vector& lengths, Sequence) +auto make_tuple_from_array_and_index_seq(const std::vector& lengths, Sequence) { return make_tuple(static_cast(lengths[Ns])...); }; template -static auto make_tuple_from_array(const std::vector& lengths, Number) +auto make_tuple_from_array(const std::vector& lengths, Number) { static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions"); @@ -51,10 +51,10 @@ static auto make_tuple_from_array(const std::vector& lengths, Number -std::vector shuffle_tensor_dimensions(const std::vector& origLengthsStrides, - const std::vector& reduceDims) +std::vector shuffle_tensor_dimensions(const std::vector& origLengthsStrides, + const std::vector& reduceDims) { - std::vector newLengthsStrides; + std::vector newLengthsStrides; assert(Rank == origLengthsStrides.size() && NumReduceDim == reduceDims.size()); diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp similarity index 58% rename from include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp rename to include/ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp index f93c65fe18..2f447c0979 100644 --- a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp +++ b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp @@ -1,5 +1,5 @@ -#ifndef DEVICE_REDUCE_MULTIBLOCK_ATOMIC_ADD_HPP -#define DEVICE_REDUCE_MULTIBLOCK_ATOMIC_ADD_HPP +#ifndef DEVICE_REDUCE_MULTIBLOCK_HPP +#define DEVICE_REDUCE_MULTIBLOCK_HPP #include #include @@ -7,8 +7,9 @@ #include "device_base.hpp" #include "device_reduce.hpp" #include "device_reduce_common.hpp" -#include "gridwise_2d_reduction_multiblock_atomic_add.hpp" +#include "gridwise_2d_reduction_multiblock.hpp" #include "gridwise_set_buffer_value.hpp" +#include "reduction_operator.hpp" namespace ck { namespace tensor_operation { @@ -22,8 +23,10 @@ template -struct DeviceReduceMultiBlockAtomicAdd - : public DeviceReduce +struct DeviceReduceMultiBlock : public DeviceReduce { static_assert(Rank <= 6, "Bigger Rank size is not supported!"); static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize, @@ -46,26 +48,40 @@ struct DeviceReduceMultiBlockAtomicAdd using IndexDataType = int32_t; + static constexpr bool HaveIndexInput = OutputIndex && HaveIndexInputIfOutputIndex; + 
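The int -> index_t / size_t -> long_index_t widening above is easiest to see with numbers; a minimal worked example of get_2d_lengths (shape and dim counts are illustrative, not from the patch):

    // Rank = 4, NumReduceDim = 2, lengths already shuffled so invariant dims
    // come first, e.g. {N, C, H, W} = {2, 16, 32, 32} reducing over H and W:
    //   invariant_total_length = 2 * 16  = 32
    //   reduce_total_length    = 32 * 32 = 1024
    // Both totals are now long_index_t, so very large reduce extents cannot
    // overflow the narrower types used before, and they stay consistent with
    // the index_t-based length/stride vectors.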
static constexpr index_t NumInvariantDim = Rank - NumReduceDim; static constexpr index_t numSrcDim = Rank; static constexpr index_t numDstDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim; static constexpr bool reduceAllDim = (NumInvariantDim == 0); - static constexpr bool support_AtomicAdd = + // So far, only AtomicAdd is considered; other atomic operations like AtomicMax can be added + // later + static constexpr bool use_multiblock = + (OutMemoryDataOperation == InMemoryDataOperationEnum::AtomicAdd); + + static constexpr bool out_type_compatible_with_atomic_op = std::is_same::value || std::is_same::value; - static_assert(!NeedIndices && support_AtomicAdd, - "MultiBlockAtomicAdd method can only be used with non-indiced operation and when " - "having float/double output type!"); + static_assert( + !use_multiblock || (use_multiblock && out_type_compatible_with_atomic_op), + "The OutDataType must support the atomic operation for using MultiBlock reduction"); - static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; - static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + static_assert(!use_multiblock || (use_multiblock && !OutputIndex), + "MultiBlock reduction can only be used when outputting index is not required"); - static auto MakeSrc2dDescriptor(const std::vector& inLengths, - const std::vector& inStrides, + static_assert( + ReduceOperation::IsCompatibleInMemoryDataOperation(OutMemoryDataOperation), + "The reduction accumulation operation must be compatible with the OutMemoryDataOperation!"); + + static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + static auto MakeSrc2dDescriptor(const std::vector& inLengths, + const std::vector& inStrides, int blkGroupSize, - int kBlockTileIterations) + int numBlockTileIteration) { const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number{}); const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number{}); @@ -109,7 +125,7 @@ struct DeviceReduceMultiBlockAtomicAdd const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{}); const auto reduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); - const int reduceSizePerBlock = K_BlockTileSize * kBlockTileIterations; + const int reduceSizePerBlock = K_BlockTileSize * numBlockTileIteration; const auto inPad_M = math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; const auto inPad_K = reduceSizePerBlock * blkGroupSize - reduceLength; @@ -124,8 +140,8 @@ struct DeviceReduceMultiBlockAtomicAdd return (in_grid_desc_m_k_padded); }; - static auto MakeDst1dDescriptor(const std::vector& outLengths, - const std::vector& outStrides) + static auto MakeDst1dDescriptor(const std::vector& outLengths, + const std::vector& outStrides) { const auto tupleDstLengths = make_tuple_from_array(outLengths, Number{}); const auto tupleDstStrides = make_tuple_from_array(outStrides, Number{}); @@ -151,31 +167,56 @@ struct DeviceReduceMultiBlockAtomicAdd return (out_grid_desc_m_padded); }; + static auto MakeDst1dDescriptorForBufferSet(const std::vector& outLengths, + const std::vector& outStrides) + { + const auto tupleDstLengths = make_tuple_from_array(outLengths, Number{}); + const auto tupleDstStrides = make_tuple_from_array(outStrides, Number{}); + + auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); + + auto out_grid_desc_m = transform_tensor_descriptor( + outDesc, +
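The round-up-and-pad pattern in these descriptor builders recurs throughout the patch; one worked instance of the arithmetic (numbers illustrative):

    // math::integer_least_multiple(x, m) rounds x up to the next multiple of m:
    //   invariantLength = 1000, M_BlockTileSize = 128 -> padded to 1024, inPad_M = 24
    //   length = 500, BlockSize = 256 (buffer-set descriptor) -> padded to 512, pad = 12
    // Padded positions are out-of-range in the underlying buffer; reads of
    // them return the invalid-element value (the reduction identity passed to
    // make_dynamic_buffer in the gridwise kernels), so they do not perturb the
    // result.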
make_tuple(make_merge_transform(tupleDstLengths)), + make_tuple(typename arithmetic_sequence_gen<0, numDstDim, 1>::type{}), + make_tuple(Sequence<0>{})); + + const auto length = out_grid_desc_m.GetLength(Number<0>{}); + + const auto pad = math::integer_least_multiple(length, BlockSize) - length; + + auto out_grid_desc_m_padded = + transform_tensor_descriptor(out_grid_desc_m, + make_tuple(make_right_pad_transform(length, pad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + return (out_grid_desc_m_padded); + }; + struct Argument : public BaseArgument { - Argument(const std::vector inLengths, - const std::vector inStrides, - const std::vector outLengths, - const std::vector outStrides, + Argument(const std::vector inLengths, + const std::vector inStrides, + const std::vector outLengths, + const std::vector outStrides, const std::vector reduceDims, float alpha, float beta, const InDataType* in_dev, + const IndexDataType* in_index_dev, OutDataType* out_dev, - IndexDataType* out_indices_dev, - AccDataType* workspace_dev, + IndexDataType* out_index_dev, const InElementwiseOperation in_elementwise_op, const AccElementwiseOperation acc_elementwise_op) : outLengths_{outLengths}, outStrides_{outStrides}, in_dev_{in_dev}, + in_index_dev_{in_index_dev}, out_dev_{out_dev}, + out_index_dev_{out_index_dev}, in_elementwise_op_{in_elementwise_op}, acc_elementwise_op_{acc_elementwise_op} { - (void)out_indices_dev; - (void)workspace_dev; - inLengths_ = shuffle_tensor_dimensions(inLengths, reduceDims); inStrides_ = shuffle_tensor_dimensions(inStrides, reduceDims); @@ -192,24 +233,35 @@ struct DeviceReduceMultiBlockAtomicAdd reduce_lowest_length = inLengths_[Rank - 1]; - int iterations = 1; - while(true) + if constexpr(use_multiblock) { - int testBlkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) / - (K_BlockTileSize * iterations); - // we want the blkGroupSize be not more than 128 - if(testBlkGroupSize <= 128) - break; + int iterations = 1; + while(true) + { + int testBlkGroupSize = + (reduce_total_length + (K_BlockTileSize * iterations) - 1) / + (K_BlockTileSize * iterations); - iterations++; + // we want the blkGroupSize to be no more than 128 + if(testBlkGroupSize <= 128) + break; + + iterations++; + }; + + blkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) / + (K_BlockTileSize * iterations); + + numBlockTileIteration = iterations; + } + else + { + blkGroupSize = 1; + numBlockTileIteration = + (reduce_total_length + K_BlockTileSize - 1) / K_BlockTileSize; }; - blkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) / - (K_BlockTileSize * iterations); - - kBlockTileIterations = iterations; - gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) / M_BlockTileSize * blkGroupSize; @@ -217,27 +269,29 @@ struct DeviceReduceMultiBlockAtomicAdd math::integer_least_multiple(invariant_total_length, BlockSize) / BlockSize; } - std::vector inLengths_; - std::vector inStrides_; - std::vector outLengths_; - std::vector outStrides_; + std::vector inLengths_; + std::vector inStrides_; + std::vector outLengths_; + std::vector outStrides_; AccDataType alpha_; AccDataType beta_; const InDataType* in_dev_; + const IndexDataType* in_index_dev_; OutDataType* out_dev_; + IndexDataType* out_index_dev_; InElementwiseOperation in_elementwise_op_; AccElementwiseOperation acc_elementwise_op_; - int invariant_lowest_length; - int reduce_lowest_length; - size_t invariant_total_length; - size_t reduce_total_length; + index_t
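The iteration search above simply grows each block's share of K until the block-group count fits the cap; a worked example (numbers illustrative):

    // reduce_total_length = 1'000'000, K_BlockTileSize = 256:
    //   iterations = 1  -> testBlkGroupSize = ceil(1e6 / 256)  = 3907 (> 128)
    //   iterations = 31 -> testBlkGroupSize = ceil(1e6 / 7936) = 127  (<= 128, stop)
    // so blkGroupSize = 127 and numBlockTileIteration = 31. In the
    // non-multiblock (Set) branch, blkGroupSize stays 1 and a single block
    // iterates ceil(reduce_total_length / K_BlockTileSize) tiles instead.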
invariant_lowest_length; + index_t reduce_lowest_length; + long_index_t invariant_total_length; + long_index_t reduce_total_length; - index_t blkGroupSize; - index_t kBlockTileIterations; + int blkGroupSize; + int numBlockTileIteration; size_t gridSize; size_t gridSize_pre; @@ -247,52 +301,69 @@ struct DeviceReduceMultiBlockAtomicAdd { float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { - const auto in_grid_desc_m_k = DeviceReduceMultiBlockAtomicAdd::MakeSrc2dDescriptor( - arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.kBlockTileIterations); - const auto out_grid_desc_m = DeviceReduceMultiBlockAtomicAdd::MakeDst1dDescriptor( + const auto in_grid_desc_m_k = DeviceReduceMultiBlock::MakeSrc2dDescriptor( + arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.numBlockTileIteration); + const auto out_grid_desc_m = + DeviceReduceMultiBlock::MakeDst1dDescriptor(arg.outLengths_, arg.outStrides_); + const auto out_grid_desc_m_2 = DeviceReduceMultiBlock::MakeDst1dDescriptorForBufferSet( arg.outLengths_, arg.outStrides_); - using InGridDesc_M_K = decltype(in_grid_desc_m_k); - using OutGridDesc_M = decltype(out_grid_desc_m); - using GridwiseReduce = - GridwiseReduction_mk_to_m_multiblock_atomic_add; + using InGridDesc_M_K = decltype(in_grid_desc_m_k); + using OutGridDesc_M = decltype(out_grid_desc_m); + using OutGridDesc_M_2 = decltype(out_grid_desc_m_2); + + using GridwiseReduce = GridwiseReduction_mk_to_m_multiblock; + + const auto kernel_main = kernel_reduce_multiblock; float avg_time = 0; - const auto kernel_pre = kernel_buffer_set_value; - const auto kernel_main = kernel_reduce_multiblock_atocmi_add; + if constexpr(use_multiblock) + { + const auto zeroVal = + ck::reduce::GetReductionZeroValueForInMemoryDataOperation( + OutMemoryDataOperation); - avg_time += launch_and_time_kernel(stream_config, - kernel_pre, - dim3(arg.gridSize_pre), - dim3(BlockSize), - 0, - out_grid_desc_m, - arg.out_dev_, - static_cast(0.0f)); + const auto kernel_pre = + kernel_buffer_set_value; + + avg_time += launch_and_time_kernel(stream_config, + kernel_pre, + dim3(arg.gridSize_pre), + dim3(BlockSize), + 0, + out_grid_desc_m_2, + arg.out_dev_, + zeroVal); + }; avg_time += launch_and_time_kernel(stream_config, kernel_main, @@ -304,25 +375,34 @@ struct DeviceReduceMultiBlockAtomicAdd arg.in_elementwise_op_, arg.acc_elementwise_op_, arg.blkGroupSize, - arg.kBlockTileIterations, + arg.numBlockTileIteration, arg.alpha_, arg.in_dev_, - arg.out_dev_); + arg.in_index_dev_, + arg.beta_, + arg.out_dev_, + arg.out_index_dev_); - return avg_time; - } + return (avg_time); + }; float Run(const BaseArgument* p_arg, const StreamConfig& stream_config = StreamConfig{}) override { return Run(*dynamic_cast(p_arg), stream_config); - } + }; }; bool IsSupportedArgument(const BaseArgument* p_arg) override { const Argument* pArg = dynamic_cast(p_arg); + if constexpr(use_multiblock) + { + if(static_cast(pArg->beta_) != 0.0f) + return (false); + }; + if constexpr(InSrcVectorDim == 0) { if constexpr(NumInvariantDim == 0) @@ -347,36 +427,43 @@ struct DeviceReduceMultiBlockAtomicAdd return (false); }; - if(static_cast(pArg->beta_) != 0.0f) - return (false); - // To improve if(pArg->invariant_lowest_length % OutDstVectorSize != 0) return (false); - // cases with small reduce_total_length should be handled by the BlockWise method - if(pArg->reduce_total_length <= BlockSize * KThreadSliceSize) - return (false); + if constexpr(use_multiblock) + { + // blkGroupSize of 1 should be handled by Blockwise path using + // 
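Because every block group adds its partial result directly into the destination, the multiblock invoker must first seed the output with the reduction identity; a note on the sequencing above (the identity value for a sum is an assumption consistent with the zeroVal usage elsewhere in the patch):

    // kernel_buffer_set_value pre-fills out_dev_ with
    // GetReductionZeroValueForInMemoryDataOperation(AtomicAdd) -- 0 for a sum
    // reduction -- so that the AtomicAdd contributions from all block groups
    // accumulate from the identity. This is also why IsSupportedArgument
    // rejects beta != 0 on the multiblock path: any prior output scaled by
    // beta would be overwritten by this pre-fill kernel.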
InMemoryDataOperationEnum::Set + if(pArg->blkGroupSize == 1) + return (false); - // This is very strong restriction, but needed to avoid some failure - if(pArg->invariant_lowest_length % M_BlockTileSize != 0) - return (false); + // This is very strong restriction, but needed to avoid some failure + if(pArg->invariant_lowest_length % M_BlockTileSize != 0) + return (false); + } + else + { + // cases with very small reduce_total_length should be handled by ThreadWise kernel + if(pArg->reduce_total_length / KThreadSliceSize < 2) + return (false); + }; return (true); }; std::unique_ptr - MakeArgumentPointer(const std::vector inLengths, - const std::vector inStrides, - const std::vector outLengths, - const std::vector outStrides, + MakeArgumentPointer(const std::vector inLengths, + const std::vector inStrides, + const std::vector outLengths, + const std::vector outStrides, const std::vector reduceDims, float alpha, float beta, const void* in_dev, + const void* in_index_dev, void* out_dev, - void* out_indices_dev, - void* workspace_dev, + void* out_index_dev, const InElementwiseOperation in_elementwise_op, const AccElementwiseOperation acc_elementwise_op) override { @@ -388,9 +475,9 @@ struct DeviceReduceMultiBlockAtomicAdd alpha, beta, static_cast(in_dev), + static_cast(in_index_dev), static_cast(out_dev), - static_cast(out_indices_dev), - static_cast(workspace_dev), + static_cast(out_index_dev), in_elementwise_op, acc_elementwise_op); }; diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp deleted file mode 100644 index b4eb8116c2..0000000000 --- a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp +++ /dev/null @@ -1,440 +0,0 @@ -#ifndef DEVICE_REDUCE_MULTIBLOCK_PARTIAL_REDUCE_HPP -#define DEVICE_REDUCE_MULTIBLOCK_PARTIAL_REDUCE_HPP - -#include -#include -#include "device.hpp" -#include "device_reduce.hpp" -#include "device_reduce_common.hpp" -#include "gridwise_2d_reduction_multiblock_partial_reduce.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { - -template -struct DeviceReduceMultiBlockPartialReduce - : public DeviceReduce -{ - static_assert(Rank <= 6, "Bigger Rank size is not supported!"); - static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize, - "Invalid thread cluster size assignments!"); - - static_assert((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) || - (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0), - "Invalid thread slice sizes and/or vector sizes configuration, please check!"); - - static_assert(OutDstVectorSize == 1, "OutDstVectorSize must be 1 for MultiBlockPartialReduce!"); - - using IndexDataType = int32_t; - - static constexpr index_t NumInvariantDim = Rank - NumReduceDim; - - static constexpr index_t numSrcDim = Rank; - static constexpr index_t numDstDim = (NumInvariantDim == 0) ? 
1 : NumInvariantDim; - static constexpr bool reduceAllDim = (NumInvariantDim == 0); - - static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; - static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; - - static constexpr int MaxBlockGroupSize = 256; - - long_index_t GetWorkspaceSizeInBytes(const std::vector inLengths, - const std::vector reduceDims) override - { - size_t invariant_total_length; - size_t reduce_total_length; - - auto inLengths_ = shuffle_tensor_dimensions(inLengths, reduceDims); - - std::tie(invariant_total_length, reduce_total_length) = - get_2d_lengths(inLengths_); - - int iterations = 1; - while(true) - { - int testBlkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) / - (K_BlockTileSize * iterations); - - if(testBlkGroupSize <= MaxBlockGroupSize) - break; - - iterations++; - }; - - int blkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) / - (K_BlockTileSize * iterations); - - long_index_t workspace_size = invariant_total_length * blkGroupSize; - - long_index_t wsSizeInBytes = - !NeedIndices - ? workspace_size * sizeof(AccDataType) - : workspace_size * (sizeof(AccDataType) + sizeof(int32_t)) + 64 + sizeof(int); - - return (wsSizeInBytes); - }; - - bool HasFurtherCall() override { return (true); }; - - static auto MakeSrc2dDescriptor(const std::vector& inLengths, - const std::vector& inStrides, - int blkGroupSize, - int kBlockTileIterations) - { - const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number{}); - const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number{}); - - const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); - - const auto in_grid_desc_m_k = [&]() { - if constexpr(reduceAllDim) - { - const auto one_dim_inDesc = transform_tensor_descriptor( - inDesc, - make_tuple(make_merge_transform(tupleSrcLengths)), - make_tuple(typename arithmetic_sequence_gen<0, numSrcDim, 1>::type{}), - make_tuple(Sequence<0>{})); - - return transform_tensor_descriptor(one_dim_inDesc, - make_tuple(make_unmerge_transform(make_tuple( - 1, one_dim_inDesc.GetLength(Number<0>{})))), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0, 1>{})); - } - else - { - using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type; - using ReduceDims = typename arithmetic_sequence_gen::type; - - const auto reduceDimLengths = - make_tuple_from_array_and_index_seq(inLengths, ReduceDims{}); - const auto invariantDimLengths = - make_tuple_from_array_and_index_seq(inLengths, InvariantDims{}); - - return transform_tensor_descriptor( - inDesc, - make_tuple(make_merge_transform(invariantDimLengths), - make_merge_transform(reduceDimLengths)), - make_tuple(InvariantDims{}, ReduceDims{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - } - }(); - - const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{}); - const auto reduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); - - const int reduceSizePerBlock = K_BlockTileSize * kBlockTileIterations; - const auto inPad_M = - math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; - const auto inPad_K = reduceSizePerBlock * blkGroupSize - reduceLength; - - auto in_grid_desc_m_k_padded = transform_tensor_descriptor( - in_grid_desc_m_k, - make_tuple(make_right_pad_transform(invariantLength, inPad_M), - make_right_pad_transform(reduceLength, inPad_K)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - return 
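A worked example of the workspace sizing that this deletion retires (numbers illustrative):

    // invariant_total_length = 1024, reduce_total_length = 1'000'000,
    // K_BlockTileSize = 256, MaxBlockGroupSize = 256:
    //   iterations = 16 -> blkGroupSize = ceil(1e6 / 4096) = 245 (<= 256)
    //   workspace  = 1024 * 245 AccDataType partials, plus, when NeedIndices,
    //                a 64-byte-aligned int32 index buffer of the same count.
    // The merged atomic-add path needs no workspace at all, which is the
    // point of removing this file.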
(in_grid_desc_m_k_padded); - }; - - static auto MakeWorkspace2dDescriptor(int invariantLength, int blkGroupSize) - { - auto ws_desc_m_k = - make_naive_tensor_descriptor_packed(make_tuple(invariantLength, blkGroupSize)); - - const auto wsPad = - math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; - - auto ws_desc_m_k_padded = - transform_tensor_descriptor(ws_desc_m_k, - make_tuple(make_right_pad_transform(invariantLength, wsPad), - make_pass_through_transform(blkGroupSize)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - return (ws_desc_m_k_padded); - }; - - struct Argument : public BaseArgument - { - Argument(const std::vector inLengths, - const std::vector inStrides, - const std::vector outLengths, - const std::vector outStrides, - const std::vector reduceDims, - float alpha, - float beta, - const InDataType* in_dev, - OutDataType* out_dev, - IndexDataType* out_indices_dev, - AccDataType* workspace_dev, - const InElementwiseOperation in_elementwise_op, - const AccElementwiseOperation acc_elementwise_op) - : outLengths_{outLengths}, - outStrides_{outStrides}, - in_dev_{in_dev}, - out_dev_{out_dev}, - out_indices_dev_{out_indices_dev}, - workspace_dev_{workspace_dev}, - in_elementwise_op_{in_elementwise_op}, - acc_elementwise_op_{acc_elementwise_op} - { - inLengths_ = shuffle_tensor_dimensions(inLengths, reduceDims); - inStrides_ = shuffle_tensor_dimensions(inStrides, reduceDims); - - alpha_ = type_convert(alpha); - beta_ = type_convert(beta); - - std::tie(invariant_total_length, reduce_total_length) = - get_2d_lengths(inLengths_); - - if constexpr(NumInvariantDim == 0) - invariant_lowest_length = 1; - else - invariant_lowest_length = inLengths_[NumInvariantDim - 1]; - - reduce_lowest_length = inLengths_[Rank - 1]; - - int iterations = 1; - while(true) - { - int testBlkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) / - (K_BlockTileSize * iterations); - - if(testBlkGroupSize <= MaxBlockGroupSize) - break; - - iterations++; - }; - - blkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) / - (K_BlockTileSize * iterations); - - kBlockTileIterations = iterations; - - gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) / - M_BlockTileSize * blkGroupSize; - - size_t ws_buf2_bytes_offset = math::integer_least_multiple( - invariant_total_length * blkGroupSize * sizeof(AccDataType), 64); - - if constexpr(NeedIndices) - workspace_indices_dev_ = reinterpret_cast( - reinterpret_cast(workspace_dev_) + ws_buf2_bytes_offset); - else - workspace_indices_dev_ = nullptr; - } - - std::vector inLengths_; - std::vector inStrides_; - std::vector outLengths_; - std::vector outStrides_; - - AccDataType alpha_; - AccDataType beta_; - - const InDataType* in_dev_; - OutDataType* out_dev_; - IndexDataType* out_indices_dev_; - AccDataType* workspace_dev_; - IndexDataType* workspace_indices_dev_; - - InElementwiseOperation in_elementwise_op_; - AccElementwiseOperation acc_elementwise_op_; - - int invariant_lowest_length; - int reduce_lowest_length; - size_t invariant_total_length; - size_t reduce_total_length; - - index_t blkGroupSize; - index_t kBlockTileIterations; - size_t gridSize; - }; - - struct Invoker : public BaseInvoker - { - float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) - { - const auto in_grid_desc_m_k = DeviceReduceMultiBlockPartialReduce::MakeSrc2dDescriptor( - arg.inLengths_, arg.inStrides_, arg.blkGroupSize, 
arg.kBlockTileIterations); - const auto ws_desc_m_k = DeviceReduceMultiBlockPartialReduce::MakeWorkspace2dDescriptor( - arg.invariant_total_length, arg.blkGroupSize); - using InGridDesc_M_K = decltype(in_grid_desc_m_k); - using WorkspaceDesc_M_K = decltype(ws_desc_m_k); - - using GridwiseReduce = - GridwiseReduction_mk_to_mk_multiblock_partial_reduce; - - float avg_time = 0; - - const auto kernel = kernel_partial_reduce_multiblock; - - avg_time = launch_and_time_kernel(stream_config, - kernel, - dim3(arg.gridSize), - dim3(BlockSize), - 0, - in_grid_desc_m_k, - ws_desc_m_k, - arg.in_elementwise_op_, - arg.acc_elementwise_op_, - arg.blkGroupSize, - arg.kBlockTileIterations, - arg.in_dev_, - arg.workspace_dev_, - arg.workspace_indices_dev_); - - return (avg_time); - }; - - float Run(const BaseArgument* p_arg, - const StreamConfig& stream_config = StreamConfig{}) override - { - return Run(*dynamic_cast(p_arg), stream_config); - } - }; - - bool IsSupportedArgument(const BaseArgument* p_arg) override - { - const Argument* pArg = dynamic_cast(p_arg); - - if constexpr(OutDstVectorSize != 1) - return (false); - - if constexpr(InSrcVectorDim == 0) - { - if constexpr(NumInvariantDim == 0) - { - return (false); - } - else - { - if(pArg->inStrides_[NumInvariantDim - 1] != 1) - return (false); - - if(pArg->invariant_lowest_length % InSrcVectorSize != 0) - return (false); - }; - } - else - { - if(pArg->inStrides_[Rank - 1] != 1) - return (false); - - if(pArg->reduce_lowest_length % InSrcVectorSize != 0) - return (false); - }; - - // cases with small reduce_total_length should be handled by the BlockWise method - if(pArg->reduce_total_length <= BlockSize * KThreadSliceSize) - return (false); - - return (true); - }; - - std::vector GetWorkspace2dLengths(const BaseArgument* p_arg) override - { - const Argument* pArg = dynamic_cast(p_arg); - - return ( - std::vector{static_cast(pArg->invariant_total_length), pArg->blkGroupSize}); - }; - - std::unique_ptr - MakeArgumentPointer(const std::vector inLengths, - const std::vector inStrides, - const std::vector outLengths, - const std::vector outStrides, - const std::vector reduceDims, - float alpha, - float beta, - const void* in_dev, - void* out_dev, - void* out_indices_dev, - void* workspace_dev, - const InElementwiseOperation in_elementwise_op, - const AccElementwiseOperation acc_elementwise_op) override - { - return std::make_unique(inLengths, - inStrides, - outLengths, - outStrides, - reduceDims, - alpha, - beta, - static_cast(in_dev), - static_cast(out_dev), - static_cast(out_indices_dev), - static_cast(workspace_dev), - in_elementwise_op, - acc_elementwise_op); - }; - - std::unique_ptr MakeInvokerPointer() override - { - return std::make_unique(); - }; - - std::string GetTypeString() const override - { - auto str = std::stringstream(); - - // clang-format off - str << "DeviceReduceMultiBlockPartialReduce<" << BlockSize << ","; - str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ","; - str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ","; - str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">"; - // clang-format on - - return str.str(); - } -}; - -} // namespace device -} // namespace tensor_operation -} // namespace ck -#endif diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp index dacb175043..9549bf65d2 100644 --- 
a/include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp +++ b/include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp @@ -6,6 +6,7 @@ #include "device.hpp" #include "device_reduce.hpp" #include "device_reduce_common.hpp" +#include "gridwise_2d_reduction_multiblock.hpp" #include "gridwise_2d_reduction_threadwise.hpp" namespace ck { @@ -19,22 +20,19 @@ template -struct DeviceReduceThreadWise : public DeviceReduce +struct DeviceReduceThreadWise : public DeviceReduce { static_assert(Rank <= 6, "Bigger Rank size is not supported!"); - static_assert((BlockSize == MThreadClusterSize) && (KThreadClusterSize == 1), - "Threadwise can only be called with KThreadClusterSize be 1 !"); static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) || (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) && @@ -43,7 +41,7 @@ struct DeviceReduceThreadWise : public DeviceReduce& inLengths, - const std::vector& inStrides) + static auto MakeSrc2dDescriptor(const std::vector& inLengths, + const std::vector& inStrides) { const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number{}); const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number{}); @@ -114,8 +112,8 @@ struct DeviceReduceThreadWise : public DeviceReduce& outLengths, - const std::vector& outStrides) + static auto MakeDst1dDescriptor(const std::vector& outLengths, + const std::vector& outStrides) { const auto tupleDstLengths = make_tuple_from_array(outLengths, Number{}); const auto tupleDstStrides = make_tuple_from_array(outStrides, Number{}); @@ -143,30 +141,26 @@ struct DeviceReduceThreadWise : public DeviceReduce inLengths, - const std::vector inStrides, - const std::vector outLengths, - const std::vector outStrides, + Argument(const std::vector inLengths, + const std::vector inStrides, + const std::vector outLengths, + const std::vector outStrides, const std::vector reduceDims, float alpha, float beta, const InDataType* in_dev, OutDataType* out_dev, - IndexDataType* out_indices_dev, - AccDataType* workspace_dev, + IndexDataType* out_index_dev, const InElementwiseOperation in_elementwise_op, - const OutElementwiseOperation acc_elementwise_op) + const AccElementwiseOperation acc_elementwise_op) : outLengths_{outLengths}, outStrides_{outStrides}, in_dev_{in_dev}, out_dev_{out_dev}, - out_indices_dev_{out_indices_dev}, + out_index_dev_{out_index_dev}, in_elementwise_op_{in_elementwise_op}, acc_elementwise_op_{acc_elementwise_op} - { - (void)workspace_dev; - inLengths_ = shuffle_tensor_dimensions(inLengths, reduceDims); inStrides_ = shuffle_tensor_dimensions(inStrides, reduceDims); @@ -183,30 +177,33 @@ struct DeviceReduceThreadWise : public DeviceReduce inLengths_; - std::vector inStrides_; - std::vector outLengths_; - std::vector outStrides_; + std::vector inLengths_; + std::vector inStrides_; + std::vector outLengths_; + std::vector outStrides_; AccDataType alpha_; AccDataType beta_; const InDataType* in_dev_; OutDataType* out_dev_; - IndexDataType* out_indices_dev_; + IndexDataType* out_index_dev_; InElementwiseOperation in_elementwise_op_; - OutElementwiseOperation acc_elementwise_op_; + AccElementwiseOperation acc_elementwise_op_; - int invariant_lowest_length; - int reduce_lowest_length; - size_t invariant_total_length; - size_t reduce_total_length; + index_t invariant_lowest_length; + index_t reduce_lowest_length; + long_index_t invariant_total_length; + long_index_t reduce_total_length; + int numBlockTileIteration; size_t gridSize; }; @@ -221,30 +218,30 @@ struct 
DeviceReduceThreadWise : public DeviceReduce; - float avg_time = 0; + using GridwiseReduce = + GridwiseReduction_mk_to_m_threadwise; + const auto kernel = kernel_reduce_threadwise; + AccElementwiseOperation>; avg_time = launch_and_time_kernel(stream_config, kernel, @@ -265,9 +262,10 @@ struct DeviceReduceThreadWise : public DeviceReduce(p_arg), stream_config); - } + }; }; bool IsSupportedArgument(const BaseArgument* p_arg) override @@ -311,9 +309,7 @@ struct DeviceReduceThreadWise : public DeviceReduceinvariant_lowest_length % OutDstVectorSize != 0) return (false); - // TODO: remove this. Should return true, as long as this DeviceOP instance support this - // case for bigger reduce_total_length size, we are supposed to use BlockWise method for - // better performance + // cases with big reduce_total_length should be handled by Blockwise kernel if(pArg->reduce_total_length / KThreadSliceSize >= 32) return (false); @@ -321,20 +317,22 @@ struct DeviceReduceThreadWise : public DeviceReduce - MakeArgumentPointer(const std::vector inLengths, - const std::vector inStrides, - const std::vector outLengths, - const std::vector outStrides, + MakeArgumentPointer(const std::vector inLengths, + const std::vector inStrides, + const std::vector outLengths, + const std::vector outStrides, const std::vector reduceDims, float alpha, float beta, const void* in_dev, + const void* in_index_dev, void* out_dev, - void* out_indices_dev, - void* workspace_dev, + void* out_index_dev, const InElementwiseOperation in_elementwise_op, - const OutElementwiseOperation acc_elementwise_op) override + const AccElementwiseOperation acc_elementwise_op) override { + (void)in_index_dev; + return std::make_unique(inLengths, inStrides, outLengths, @@ -344,8 +342,7 @@ struct DeviceReduceThreadWise : public DeviceReduce(in_dev), static_cast(out_dev), - static_cast(out_indices_dev), - static_cast(workspace_dev), + static_cast(out_index_dev), in_elementwise_op, acc_elementwise_op); }; @@ -360,9 +357,9 @@ struct DeviceReduceThreadWise : public DeviceReduce"; // clang-format on diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_blockwise.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_blockwise.hpp deleted file mode 100644 index 6826d5211c..0000000000 --- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_blockwise.hpp +++ /dev/null @@ -1,886 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
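Read together, the updated IsSupportedArgument checks partition the problem space roughly as follows (an interpretation of this patch, not a documented contract):

    // reduce_total_length / KThreadSliceSize <  2      -> DeviceReduceThreadWise only
    // reduce_total_length / KThreadSliceSize in [2,32) -> ThreadWise or
    //     DeviceReduceMultiBlock with InMemoryDataOperationEnum::Set (blockwise)
    // reduce_total_length / KThreadSliceSize >= 32     -> MultiBlock only; the
    //     AtomicAdd flavor additionally requires blkGroupSize > 1, beta == 0,
    //     and invariant_lowest_length % M_BlockTileSize == 0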
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef CK_GRIDWISE_2D_REDUCTION_BLOCKWISE_HPP -#define CK_GRIDWISE_2D_REDUCTION_BLOCKWISE_HPP - -#include "data_type.hpp" -#include "reduction_common.hpp" -#include "reduction_operator.hpp" -#include "reduction_functions_accumulate.hpp" -#include "reduction_functions_blockwise.hpp" -#include "reduction_functions_threadwise.hpp" -#include "threadwise_tensor_slice_transfer.hpp" -#include "cluster_descriptor.hpp" -#include "element_wise_operation.hpp" - -namespace ck { - -template -__global__ void kernel_reduce_blockwise(const InGridDesc_M_K in_grid_desc_m_k, - const OutGridDesc_M out_grid_desc_m, - const InElementwiseOperation in_elementwise_op, - const OutElementwiseOperation acc_elementwise_op, - AccDataType alpha, - const InDataType* const __restrict__ p_in_global, - AccDataType beta, - OutDataType* const __restrict__ p_out_global, - const IndexDataType* const __restrict__ p_ws_indices_global, - IndexDataType* const __restrict__ p_indices_global) -{ - if constexpr(!NeedIndices) - { - constexpr bool IsSecondCall = false; - - GridwiseReduction::template Run(in_grid_desc_m_k, - out_grid_desc_m, - in_elementwise_op, - acc_elementwise_op, - alpha, - p_in_global, - beta, - p_out_global, - p_ws_indices_global, - p_indices_global); - } - else - { - GridwiseReduction::RunWithIndex(in_grid_desc_m_k, - out_grid_desc_m, - in_elementwise_op, - acc_elementwise_op, - alpha, - p_in_global, - beta, - p_out_global, - p_ws_indices_global, - p_indices_global); - }; -}; - -template -__global__ void -kernel_reduce_blockwise_second_call(const InGridDesc_M_K in_grid_desc_m_k, - const OutGridDesc_M out_grid_desc_m, - const InElementwiseOperation in_elementwise_op, - const OutElementwiseOperation acc_elementwise_op, - AccDataType alpha, - const InDataType* const __restrict__ p_in_global, - AccDataType beta, - OutDataType* const __restrict__ p_out_global, - const IndexDataType* const __restrict__ p_ws_indices_global, - IndexDataType* const __restrict__ p_indices_global) -{ - if constexpr(!NeedIndices) - { - constexpr bool IsSecondCall = true; - - GridwiseReduction::template Run(in_grid_desc_m_k, - out_grid_desc_m, - in_elementwise_op, - acc_elementwise_op, - alpha, - p_in_global, - beta, - p_out_global, - p_ws_indices_global, - p_indices_global); - } - else - { - GridwiseReduction::RunSecondCallWithIndex(in_grid_desc_m_k, - out_grid_desc_m, - in_elementwise_op, - acc_elementwise_op, - alpha, - p_in_global, - beta, - p_out_global, - p_ws_indices_global, - p_indices_global); - }; -}; - -template -struct GridwiseReduction_mk_to_m_blockwise -{ - static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) || - (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) && - (MThreadSliceSize % OutDstVectorSize == 0), - "Invalid thread slice sizes and/or vector sizes configuration, please check!"); - - static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0); - - using ThreadClusterLengths_M_K = Sequence; - - using ThreadBufferDimAccessOrder = - typename conditional, Sequence<0, 1>>::type; - - using ThreadClusterArrangeOrder = - typename conditional, Sequence<0, 1>>::type; - - static constexpr auto thread_cluster_desc = - 
make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{}); - - using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{}))); - using ThreadReduceDstDesc_M = - decltype(make_naive_tensor_descriptor_packed(make_tuple(Number{}))); - - using PassThroughOp = tensor_operation::element_wise::PassThrough; - - static constexpr auto I0 = Number<0>{}; - static constexpr auto I1 = Number<1>{}; - - static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; - static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; - - template - __device__ static void Run(const InGridDesc_M_K& in_grid_desc_m_k, - const OutGridDesc_M& out_grid_desc_m, - const InElementwiseOperation& in_elementwise_op, - const OutElementwiseOperation& acc_elementwise_op, - AccDataType alpha, - const InDataType* const __restrict__ p_in_global, - AccDataType beta, - OutDataType* const __restrict__ p_out_global, - const IndexDataType* const __restrict__ p_ws_indices_global, - IndexDataType* const __restrict__ p_indices_global) - { - if constexpr(IsSecondCall) - { - static_assert(InSrcVectorDim == 1, - "InSrcVectorDim must be 1 for BlockwiseSecondCall, please check!"); - }; - - using BlockwiseReduce = PartitionedBlockwiseReduction; - - using ThreadwiseReduce = ThreadwiseReduction; - - (void)p_ws_indices_global; - (void)p_indices_global; - - // LDS - __shared__ AccDataType p_reduce_work_buffer[BlockSize]; - - const auto zeroVal = ReduceOperation::GetReductionZeroVal(); - - const auto in_global_buf = make_dynamic_buffer( - p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert(zeroVal)); - auto out_global_buf = make_dynamic_buffer( - p_out_global, out_grid_desc_m.GetElementSpaceSize()); - - auto reduce_work_buf = - make_dynamic_buffer(p_reduce_work_buffer, BlockSize); - - StaticBuffer - in_thread_buf; - - StaticBuffer accu_value_buf; - - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; }); - - const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); - - const index_t thread_local_id = get_thread_local_1d_id(); - const index_t block_global_1d_id = get_block_1d_id(); - - const auto thread_cluster_idx = - thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); - - const auto thread_m_cluster_id = thread_cluster_idx[I0]; - const auto thread_k_cluster_id = thread_cluster_idx[I1]; - - using ThreadBufferLengths = Sequence; - constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{})); - - auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2( - in_grid_desc_m_k, - make_multi_index(block_global_1d_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize, - thread_k_cluster_id * KThreadSliceSize)); - - constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize); - - const index_t toReduceTiles = (toReduceLength + K_BlockTileSize - 1) / K_BlockTileSize; - - index_t reducedTiles = 0; - do - { - threadwise_src_load.Run(in_grid_desc_m_k, - in_global_buf, - thread_buffer_desc, - make_tuple(I0, I0), - in_thread_buf); - - static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { - // do element-wise pre-reduction operation - static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { - constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); - in_elementwise_op(in_thread_buf(Number{}), - in_thread_buf(Number{})); - }); - }); - - ThreadwiseReduce::Reduce(in_thread_buf, 
accu_value_buf); - - threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); - - reducedTiles++; - } while(reducedTiles < toReduceTiles); - - constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{}; - - static_for<0, MThreadSliceSize, 1>{}( - [&](auto I) { BlockwiseReduce::Reduce(reduce_work_buf, accu_value_buf(I)); }); - - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - if(thread_k_cluster_id == 0) - { - acc_elementwise_op(accu_value_buf(I), accu_value_buf(I)); - - accu_value_buf(I) *= alpha; - } - }); - - if(thread_k_cluster_id == 0) - { - if constexpr(!BetaIsZero) - { - if(!float_equal_zero{}(beta)) - { - StaticBuffer - priorDstValueBuf; - - auto threadwise_dst_load = - ThreadwiseTensorSliceTransfer_v2, - Sequence<0>, - 0, - OutDstVectorSize, - 1, - false>( - out_grid_desc_m, - make_multi_index(block_global_1d_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize)); - - threadwise_dst_load.Run(out_grid_desc_m, - out_global_buf, - reduced_data_desc, - make_tuple(I0), - priorDstValueBuf); - - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - accu_value_buf(I) += type_convert(priorDstValueBuf[I]) * beta; - }); - }; - }; - - auto threadwise_dst_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - OutDstVectorSize, - InMemoryDataOperationEnum::Set, - 1, - true>( - out_grid_desc_m, - make_multi_index(block_global_1d_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize), - PassThroughOp{}); - - threadwise_dst_store.Run( - reduced_data_desc, make_tuple(I0), accu_value_buf, out_grid_desc_m, out_global_buf); - } - }; - - __device__ static void RunWithIndex(const InGridDesc_M_K& in_grid_desc_m_k, - const OutGridDesc_M& out_grid_desc_m, - const InElementwiseOperation& in_elementwise_op, - const OutElementwiseOperation& acc_elementwise_op, - AccDataType alpha, - const InDataType* const __restrict__ p_in_global, - AccDataType beta, - OutDataType* const __restrict__ p_out_global, - const IndexDataType* const __restrict__ p_ws_indices_global, - IndexDataType* const __restrict__ p_indices_global) - { - using BlockwiseReduceWithIndex = - PartitionedBlockwiseReductionWithIndex; - - using AccumulationWithIndex = detail::AccumulateWithIndexAndNanCheck; - - (void)p_ws_indices_global; - - // LDS - __shared__ AccDataType p_reduce_work_val_buffer[BlockSize]; - __shared__ IndexDataType p_reduce_work_idx_buffer[BlockSize]; - - const auto zeroVal = ReduceOperation::GetReductionZeroVal(); - - const auto in_global_buf = make_dynamic_buffer( - p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert(zeroVal)); - auto out_global_val_buf = make_dynamic_buffer( - p_out_global, out_grid_desc_m.GetElementSpaceSize()); - auto out_global_idx_buf = make_dynamic_buffer( - p_indices_global, out_grid_desc_m.GetElementSpaceSize()); - - auto reduce_work_val_buf = - make_dynamic_buffer(p_reduce_work_val_buffer, BlockSize); - auto reduce_work_idx_buf = - make_dynamic_buffer(p_reduce_work_idx_buffer, BlockSize); - - StaticBuffer - in_thread_val_buf; - - StaticBuffer - in_thread_idx_buf; - - StaticBuffer accu_value_buf; - StaticBuffer accu_index_buf; - - const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); - - const index_t thread_local_id = get_thread_local_1d_id(); - const index_t block_global_1d_id = get_block_1d_id(); - - const auto thread_cluster_idx = - thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); - - const auto thread_m_cluster_id = thread_cluster_idx[I0]; - const auto thread_k_cluster_id = 
thread_cluster_idx[I1]; - - using ThreadBufferLengths = Sequence; - constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{})); - - auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2( - in_grid_desc_m_k, - make_multi_index(block_global_1d_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize, - thread_k_cluster_id * KThreadSliceSize)); - - index_t indexOffset = 0; - - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - accu_value_buf(I) = zeroVal; - accu_index_buf(I) = 0; - }); - - constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize); - - const index_t toReduceTiles = (toReduceLength + K_BlockTileSize - 1) / K_BlockTileSize; - - index_t reducedTiles = 0; - do - { - // load the thread slice - threadwise_src_load.Run(in_grid_desc_m_k, - in_global_buf, - thread_buffer_desc, - make_tuple(I0, I0), - in_thread_val_buf); - - static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { - static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { - constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); - - // initialize the indices for the per-thread to-reduce values - in_thread_idx_buf(Number{}) = - indexOffset + thread_k_cluster_id * KThreadSliceSize + iK(); - - // do element-wise pre-reduction operation - in_elementwise_op(in_thread_val_buf(Number{}), - in_thread_val_buf(Number{})); - }); - - AccDataType tmpValue = zeroVal; - IndexDataType tmpIndex = 0; - - static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { - constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); - - AccumulationWithIndex::Calculate(tmpValue, - in_thread_val_buf[Number{}], - tmpIndex, - in_thread_idx_buf[Number{}]); - }); - - BlockwiseReduceWithIndex::Reduce( - reduce_work_val_buf, reduce_work_idx_buf, tmpValue, tmpIndex); - - AccumulationWithIndex::Calculate( - accu_value_buf(iM), tmpValue, accu_index_buf(iM), tmpIndex); - }); - - threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); - - indexOffset += K_BlockTileSize; - reducedTiles++; - } while(reducedTiles < toReduceTiles); - - constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{}; - - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - if(thread_k_cluster_id == 0) - { - // for indiced operation, acc_elementwise_op shoud do nothing - acc_elementwise_op(accu_value_buf(I), accu_value_buf(I)); - - accu_value_buf(I) *= alpha; - } - }); - - if(thread_k_cluster_id == 0) - { - if constexpr(!BetaIsZero) - { - if(!float_equal_zero{}(beta)) - { - StaticBuffer - priorDstValueBuf; - - auto threadwise_dst_load = - ThreadwiseTensorSliceTransfer_v2, - Sequence<0>, - 0, - OutDstVectorSize, - 1, - false>( - out_grid_desc_m, - make_multi_index(block_global_1d_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize)); - - threadwise_dst_load.Run(out_grid_desc_m, - out_global_val_buf, - reduced_data_desc, - make_tuple(I0), - priorDstValueBuf); - - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - accu_value_buf(I) += type_convert(priorDstValueBuf[I]) * beta; - }); - }; - }; - - auto threadwise_dst_val_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - OutDstVectorSize, - InMemoryDataOperationEnum::Set, - 1, - false>( - out_grid_desc_m, - make_multi_index(block_global_1d_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize), - PassThroughOp{}); - - auto threadwise_dst_idx_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - OutDstVectorSize, - InMemoryDataOperationEnum::Set, - 1, - false>( 
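The index-tracking accumulation used in this (removed) path, and again in the new multiblock code, pairs each candidate value with its flattened K position; a hedged sketch of what AccumulateWithIndexAndNanCheck::Calculate does, reconstructed from its call sites rather than quoted:

    // Calculate(accu, val, accuIdx, idx) behaves, in spirit, like:
    //   if(val wins against accu under ReduceOperation (e.g. '>' for max),
    //      or PropagateNan is set and val is NaN)
    //   { accu = val; accuIdx = idx; }
    // After the threadwise pass and the LDS blockwise pass, accu_index_buf
    // holds the K offset of the winning element for each M row.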
- out_grid_desc_m, - make_multi_index(block_global_1d_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize), - PassThroughOp{}); - - threadwise_dst_val_store.Run(reduced_data_desc, - make_tuple(I0), - accu_value_buf, - out_grid_desc_m, - out_global_val_buf); - threadwise_dst_idx_store.Run(reduced_data_desc, - make_tuple(I0), - accu_index_buf, - out_grid_desc_m, - out_global_idx_buf); - } - }; - - __device__ static void - RunSecondCallWithIndex(const InGridDesc_M_K& in_grid_desc_m_k, - const OutGridDesc_M& out_grid_desc_m, - const InElementwiseOperation in_elementwise_op, - const OutElementwiseOperation acc_elementwise_op, - AccDataType alpha, - const InDataType* const __restrict__ p_ws_values_global, - AccDataType beta, - OutDataType* const __restrict__ p_out_global, - const IndexDataType* const __restrict__ p_ws_indices_global, - IndexDataType* const __restrict__ p_indices_global) - { - static_assert(InSrcVectorDim == 1, - "InSrcVectorDim must be 1 for BlockwiseSecondCall, please check!"); - - using BlockwiseReduceWithIndex = - PartitionedBlockwiseReductionWithIndex, - ThreadClusterArrangeOrder, - ReduceOperation, - PropagateNan>; - - using AccumulationWithIndex = detail::AccumulateWithIndexAndNanCheck; - - (void)in_elementwise_op; - - // LDS - __shared__ AccDataType p_reduce_work_val_buffer[BlockSize]; - __shared__ IndexDataType p_reduce_work_idx_buffer[BlockSize]; - - const auto zeroVal = ReduceOperation::GetReductionZeroVal(); - - const auto src_global_val_buf = - make_dynamic_buffer(p_ws_values_global, - in_grid_desc_m_k.GetElementSpaceSize(), - type_convert(zeroVal)); - const auto src_global_idx_buf = make_dynamic_buffer( - p_ws_indices_global, in_grid_desc_m_k.GetElementSpaceSize()); - auto out_global_val_buf = make_dynamic_buffer( - p_out_global, out_grid_desc_m.GetElementSpaceSize()); - auto out_global_idx_buf = make_dynamic_buffer( - p_indices_global, out_grid_desc_m.GetElementSpaceSize()); - - auto reduce_work_val_buf = - make_dynamic_buffer(p_reduce_work_val_buffer, BlockSize); - auto reduce_work_idx_buf = - make_dynamic_buffer(p_reduce_work_idx_buffer, BlockSize); - - StaticBuffer - in_thread_val_buf; - - StaticBuffer - in_thread_idx_buf; - - StaticBuffer accu_value_buf; - StaticBuffer accu_index_buf; - - const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); - - const index_t thread_local_id = get_thread_local_1d_id(); - const index_t block_global_1d_id = get_block_1d_id(); - - const auto thread_cluster_idx = - thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); - - const auto thread_m_cluster_id = thread_cluster_idx[I0]; - const auto thread_k_cluster_id = thread_cluster_idx[I1]; - - using ThreadBufferLengths = Sequence; - constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{})); - - auto threadwise_src_val_load = - ThreadwiseTensorSliceTransfer_v2( - in_grid_desc_m_k, - make_multi_index(block_global_1d_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize, - thread_k_cluster_id * KThreadSliceSize)); - - auto threadwise_src_idx_load = - ThreadwiseTensorSliceTransfer_v2( - in_grid_desc_m_k, - make_multi_index(block_global_1d_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize, - thread_k_cluster_id * KThreadSliceSize)); - - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - accu_value_buf(I) = zeroVal; - accu_index_buf(I) = 0; - }); - - constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize); - - const index_t toReduceTiles = 
(toReduceLength + K_BlockTileSize - 1) / K_BlockTileSize; - - index_t reducedTiles = 0; - do - { - // load the thread slice - threadwise_src_val_load.Run(in_grid_desc_m_k, - src_global_val_buf, - thread_buffer_desc, - make_tuple(I0, I0), - in_thread_val_buf); - threadwise_src_idx_load.Run(in_grid_desc_m_k, - src_global_idx_buf, - thread_buffer_desc, - make_tuple(I0, I0), - in_thread_idx_buf); - - static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { - AccDataType tmpValue = zeroVal; - IndexDataType tmpIndex = 0; - - static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { - constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); - - AccumulationWithIndex::Calculate(tmpValue, - in_thread_val_buf[Number{}], - tmpIndex, - in_thread_idx_buf[Number{}]); - }); - - BlockwiseReduceWithIndex::Reduce( - reduce_work_val_buf, reduce_work_idx_buf, tmpValue, tmpIndex); - - AccumulationWithIndex::Calculate( - accu_value_buf(iM), tmpValue, accu_index_buf(iM), tmpIndex); - }); - - threadwise_src_val_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); - threadwise_src_idx_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); - - reducedTiles++; - } while(reducedTiles < toReduceTiles); - - constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{}; - - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - if(thread_k_cluster_id == 0) - { - // for indiced operation, acc_elementwise_op shoud do nothing - acc_elementwise_op(accu_value_buf(I), accu_value_buf(I)); - - accu_value_buf(I) *= alpha; - } - }); - - if(thread_k_cluster_id == 0) - { - if constexpr(!BetaIsZero) - { - if(!float_equal_zero{}(beta)) - { - StaticBuffer - priorDstValueBuf; - - auto threadwise_dst_load = - ThreadwiseTensorSliceTransfer_v2, - Sequence<0>, - 0, - OutDstVectorSize, - 1, - true>( - out_grid_desc_m, - make_multi_index(block_global_1d_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize)); - - threadwise_dst_load.Run(out_grid_desc_m, - out_global_val_buf, - reduced_data_desc, - make_tuple(I0), - priorDstValueBuf); - - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - accu_value_buf(I) += type_convert(priorDstValueBuf[I]) * beta; - }); - }; - }; - - auto threadwise_dst_val_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - OutDstVectorSize, - InMemoryDataOperationEnum::Set, - 1, - true>( - out_grid_desc_m, - make_multi_index(block_global_1d_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize), - PassThroughOp{}); - - auto threadwise_dst_idx_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - OutDstVectorSize, - InMemoryDataOperationEnum::Set, - 1, - true>( - out_grid_desc_m, - make_multi_index(block_global_1d_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize), - PassThroughOp{}); - - threadwise_dst_val_store.Run(reduced_data_desc, - make_tuple(I0), - accu_value_buf, - out_grid_desc_m, - out_global_val_buf); - threadwise_dst_idx_store.Run(reduced_data_desc, - make_tuple(I0), - accu_index_buf, - out_grid_desc_m, - out_global_idx_buf); - } - }; -}; - -} // namespace ck -#endif diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp new file mode 100644 index 0000000000..f3e9836d4f --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp @@ -0,0 +1,638 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020 
Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_HPP +#define CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_HPP + +#include "reduction_common.hpp" +#include "reduction_operator.hpp" +#include "reduction_functions_accumulate.hpp" +#include "reduction_functions_blockwise.hpp" +#include "reduction_functions_threadwise.hpp" + +#include "threadwise_tensor_slice_transfer.hpp" +#include "element_wise_operation.hpp" + +namespace ck { + +template +__global__ void kernel_reduce_multiblock(const InGridDesc_M_K in_grid_desc_m_k, + const OutGridDesc_M out_grid_desc_m, + const InElementwiseOperation in_elementwise_op, + const AccElementwiseOperation acc_elementwise_op, + index_t block_group_size, + index_t num_k_block_tile_iteration, + AccDataType alpha, + const InDataType* const __restrict__ p_in_value_global, + const IndexDataType* const __restrict__ p_in_index_global, + AccDataType beta, + OutDataType* const __restrict__ p_out_value_global, + IndexDataType* const __restrict__ p_out_index_global) +{ + if constexpr(!OutputIndex) + { + (void)p_in_index_global; + (void)p_out_index_global; + + GridwiseReduction::Run(in_grid_desc_m_k, + out_grid_desc_m, + in_elementwise_op, + acc_elementwise_op, + block_group_size, + num_k_block_tile_iteration, + alpha, + p_in_value_global, + beta, + p_out_value_global); + } + else + { + GridwiseReduction::template RunWithIndex(in_grid_desc_m_k, + out_grid_desc_m, + in_elementwise_op, + acc_elementwise_op, + num_k_block_tile_iteration, + alpha, + p_in_value_global, + p_in_index_global, + beta, + p_out_value_global, + p_out_index_global); + }; +}; + +template +struct GridwiseReduction_mk_to_m_multiblock +{ + static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) || + (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) && + (MThreadSliceSize % OutDstVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0); + + using ThreadClusterLengths_M_K = Sequence; + + using ThreadBufferDimAccessOrder = + typename conditional, Sequence<0, 1>>::type; + + using ThreadClusterArrangeOrder = + typename conditional, Sequence<0, 1>>::type; + + static constexpr auto thread_cluster_desc = + make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{}); + 
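An example tile geometry for the cluster/slice parameters handled above (values illustrative; any choice with BlockSize == MThreadClusterSize * KThreadClusterSize works):

    // BlockSize = 256, MThreadClusterSize = 32, KThreadClusterSize = 8,
    // MThreadSliceSize = 4, KThreadSliceSize = 8
    //   -> M_BlockTileSize = 32 * 4 = 128, K_BlockTileSize = 8 * 8 = 64
    // Each thread privately reduces a 4 x 8 slice per tile iteration; when
    // InSrcVectorDim == 0, reorder_thread_cluster flips the cluster arrange
    // order so neighboring threads touch neighboring M elements, keeping the
    // M-dimension loads vectorizable.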
+ using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}))); + using ThreadReduceDstDesc_M = + decltype(make_naive_tensor_descriptor_packed(make_tuple(Number{}))); + + using BlockwiseReduce = PartitionedBlockwiseReduction; + + using ThreadwiseReduce = ThreadwiseReduction; + + using PassThroughOp = tensor_operation::element_wise::PassThrough; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + + static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + using Accumulation = detail::AccumulateWithNanCheck; + + __device__ static void Run(const InGridDesc_M_K& in_grid_desc_m_k, + const OutGridDesc_M& out_grid_desc_m, + const InElementwiseOperation& in_elementwise_op, + const AccElementwiseOperation& acc_elementwise_op, + index_t block_group_size, + index_t num_k_block_tile_iteration, + AccDataType alpha, + const InDataType* const __restrict__ p_in_value_global, + AccDataType beta, + OutDataType* const __restrict__ p_out_value_global) + { + const auto zeroVal = ReduceOperation::GetReductionZeroVal(); + + // LDS + __shared__ AccDataType p_reduce_work_buffer[BlockSize]; + + const auto in_global_val_buf = + make_dynamic_buffer(p_in_value_global, + in_grid_desc_m_k.GetElementSpaceSize(), + type_convert(zeroVal)); + auto out_global_val_buf = make_dynamic_buffer( + p_out_value_global, out_grid_desc_m.GetElementSpaceSize()); + + auto reduce_work_buf = + make_dynamic_buffer(p_reduce_work_buffer, BlockSize); + + StaticBuffer + in_thread_buf; + + StaticBuffer accu_value_buf; + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; }); + + const index_t thread_local_id = get_thread_local_1d_id(); + const index_t block_global_id = get_block_1d_id(); + const index_t blkgroup_id = block_global_id / block_group_size; + const index_t block_local_id = block_global_id % block_group_size; + + const auto thread_cluster_idx = + thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); + + const auto thread_m_cluster_id = thread_cluster_idx[I0]; + const auto thread_k_cluster_id = thread_cluster_idx[I1]; + + const index_t reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration; + + using ThreadBufferLengths = Sequence; + constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + + auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2( + in_grid_desc_m_k, + make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, + block_local_id * reduceSizePerBlock + + thread_k_cluster_id * KThreadSliceSize)); + + constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize); + + index_t reducedTiles = 0; + do + { + threadwise_src_load.Run(in_grid_desc_m_k, + in_global_val_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + // do element-wise pre-reduction operation + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); + in_elementwise_op(in_thread_buf(Number{}), + in_thread_buf(Number{})); + }); + }); + + ThreadwiseReduce::Reduce(in_thread_buf, accu_value_buf); + + threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); + + reducedTiles++; + } while(reducedTiles < num_k_block_tile_iteration); + + 
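At this point each block has privately reduced its share of the K axis. As a reference for the index arithmetic above: `block_group_size` consecutive blocks cooperate on one M tile, and each owns a disjoint K range of `reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration` elements. A standalone sketch with hypothetical sizes:

```cpp
// How blkgroup_id / block_local_id partition the reduction grid.
#include <cstdio>

int main()
{
    const int M_BlockTileSize = 128, K_BlockTileSize = 512;
    const int block_group_size = 4, num_k_block_tile_iteration = 2;
    const int reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration;

    for(int block_global_id = 0; block_global_id < 8; ++block_global_id)
    {
        const int blkgroup_id    = block_global_id / block_group_size; // which M tile
        const int block_local_id = block_global_id % block_group_size; // which K slice
        std::printf("block %d: rows [%d, %d), k offset %d, k length %d\n",
                    block_global_id,
                    blkgroup_id * M_BlockTileSize,
                    (blkgroup_id + 1) * M_BlockTileSize,
                    block_local_id * reduceSizePerBlock,
                    reduceSizePerBlock);
    }
    return 0;
}
```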
constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{}; + + static_for<0, MThreadSliceSize, 1>{}( + [&](auto I) { BlockwiseReduce::Reduce(reduce_work_buf, accu_value_buf(I)); }); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + if(thread_k_cluster_id == 0) + { + acc_elementwise_op(accu_value_buf(I), accu_value_buf(I)); + + accu_value_buf(I) *= alpha; + } + }); + + if(thread_k_cluster_id == 0) + { + if(block_group_size == 0 && !float_equal_zero{}(beta)) + { + StaticBuffer + priorDstValueBuf; + + auto threadwise_dst_load = + ThreadwiseTensorSliceTransfer_v2, + Sequence<0>, + 0, + OutDstVectorSize, + 1, + false>( + out_grid_desc_m, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize)); + + threadwise_dst_load.Run(out_grid_desc_m, + out_global_val_buf, + reduced_data_desc, + make_tuple(I0), + priorDstValueBuf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + accu_value_buf(I) += type_convert(priorDstValueBuf[I]) * beta; + }); + }; + + auto threadwise_dst_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + OutDstVectorSize, + OutMemoryDataOperation, + 1, + true>( + out_grid_desc_m, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize), + PassThroughOp{}); + + threadwise_dst_store.Run(reduced_data_desc, + make_tuple(I0), + accu_value_buf, + out_grid_desc_m, + out_global_val_buf); + } + }; + + template + __device__ static void RunWithIndex(const InGridDesc_M_K& in_grid_desc_m_k, + const OutGridDesc_M& out_grid_desc_m, + const InElementwiseOperation in_elementwise_op, + const AccElementwiseOperation acc_elementwise_op, + index_t num_k_block_tile_iteration, + AccDataType alpha, + const InDataType* const __restrict__ p_in_value_global, + const IndexDataType* const __restrict__ p_in_index_global, + AccDataType beta, + OutDataType* const __restrict__ p_out_value_global, + IndexDataType* const __restrict__ p_out_index_global) + { + using BlockwiseReduceWithIndex = + PartitionedBlockwiseReductionWithIndex, + ThreadClusterArrangeOrder, + ReduceOperation, + PropagateNan>; + + using AccumulationWithIndex = detail::AccumulateWithIndexAndNanCheck; + + (void)in_elementwise_op; + + // LDS + __shared__ AccDataType p_reduce_work_val_buffer[BlockSize]; + __shared__ IndexDataType p_reduce_work_idx_buffer[BlockSize]; + + const auto zeroVal = ReduceOperation::GetReductionZeroVal(); + + const auto in_global_val_buf = + make_dynamic_buffer(p_in_value_global, + in_grid_desc_m_k.GetElementSpaceSize(), + type_convert(zeroVal)); + const auto in_global_idx_buf = make_dynamic_buffer( + p_in_index_global, in_grid_desc_m_k.GetElementSpaceSize()); + auto out_global_val_buf = make_dynamic_buffer( + p_out_value_global, out_grid_desc_m.GetElementSpaceSize()); + auto out_global_idx_buf = make_dynamic_buffer( + p_out_index_global, out_grid_desc_m.GetElementSpaceSize()); + + auto reduce_work_val_buf = + make_dynamic_buffer(p_reduce_work_val_buffer, BlockSize); + auto reduce_work_idx_buf = + make_dynamic_buffer(p_reduce_work_idx_buffer, BlockSize); + + StaticBuffer + in_thread_val_buf; + + StaticBuffer + in_thread_idx_buf; + + StaticBuffer accu_value_buf; + StaticBuffer accu_index_buf; + + const index_t thread_local_id = get_thread_local_1d_id(); + const index_t block_global_1d_id = get_block_1d_id(); + + const auto thread_cluster_idx = + thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); + + const auto thread_m_cluster_id = thread_cluster_idx[I0]; + const auto thread_k_cluster_id = 
thread_cluster_idx[I1]; + + using ThreadBufferLengths = Sequence; + constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + + auto threadwise_src_val_load = + ThreadwiseTensorSliceTransfer_v2( + in_grid_desc_m_k, + make_multi_index(block_global_1d_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * KThreadSliceSize)); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + accu_value_buf(I) = zeroVal; + accu_index_buf(I) = 0; + }); + + constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize); + + index_t reducedTiles = 0; + + if constexpr(HaveIndexInput) + { + auto threadwise_src_idx_load = + ThreadwiseTensorSliceTransfer_v2( + in_grid_desc_m_k, + make_multi_index(block_global_1d_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * KThreadSliceSize)); + + do + { + // load the thread slice + threadwise_src_val_load.Run(in_grid_desc_m_k, + in_global_val_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_val_buf); + threadwise_src_idx_load.Run(in_grid_desc_m_k, + in_global_idx_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_idx_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + AccDataType tmpValue = zeroVal; + IndexDataType tmpIndex = 0; + + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = + thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); + + AccumulationWithIndex::Calculate(tmpValue, + in_thread_val_buf[Number{}], + tmpIndex, + in_thread_idx_buf[Number{}]); + }); + + BlockwiseReduceWithIndex::Reduce( + reduce_work_val_buf, reduce_work_idx_buf, tmpValue, tmpIndex); + + AccumulationWithIndex::Calculate( + accu_value_buf(iM), tmpValue, accu_index_buf(iM), tmpIndex); + }); + + threadwise_src_val_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); + threadwise_src_idx_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); + + reducedTiles++; + } while(reducedTiles < num_k_block_tile_iteration); + } + else + { + index_t indexOffset = 0; + + do + { + // load the thread slice + threadwise_src_val_load.Run(in_grid_desc_m_k, + in_global_val_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_val_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = + thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); + + // initialize the indices for the per-thread to-reduce values + in_thread_idx_buf(Number{}) = + indexOffset + thread_k_cluster_id * KThreadSliceSize + iK(); + + // do element-wise pre-reduction operation + in_elementwise_op(in_thread_val_buf(Number{}), + in_thread_val_buf(Number{})); + }); + + AccDataType tmpValue = zeroVal; + IndexDataType tmpIndex = 0; + + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = + thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); + + AccumulationWithIndex::Calculate(tmpValue, + in_thread_val_buf[Number{}], + tmpIndex, + in_thread_idx_buf[Number{}]); + }); + + BlockwiseReduceWithIndex::Reduce( + reduce_work_val_buf, reduce_work_idx_buf, tmpValue, tmpIndex); + + AccumulationWithIndex::Calculate( + accu_value_buf(iM), tmpValue, accu_index_buf(iM), tmpIndex); + }); + + threadwise_src_val_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); + + indexOffset += K_BlockTileSize; + reducedTiles++; + } while(reducedTiles < num_k_block_tile_iteration); + }; + + constexpr auto reduced_data_desc = 
ThreadReduceDstDesc_M{}; + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + if(thread_k_cluster_id == 0) + { + // for indexed operation, acc_elementwise_op should do nothing + acc_elementwise_op(accu_value_buf(I), accu_value_buf(I)); + + accu_value_buf(I) *= alpha; + } + }); + + if(thread_k_cluster_id == 0) + { + if(!float_equal_zero{}(beta)) + { + StaticBuffer + priorDstValueBuf; + + auto threadwise_dst_load = + ThreadwiseTensorSliceTransfer_v2, + Sequence<0>, + 0, + OutDstVectorSize, + 1, + true>( + out_grid_desc_m, + make_multi_index(block_global_1d_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize)); + + threadwise_dst_load.Run(out_grid_desc_m, + out_global_val_buf, + reduced_data_desc, + make_tuple(I0), + priorDstValueBuf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + accu_value_buf(I) += type_convert(priorDstValueBuf[I]) * beta; + }); + }; + + auto threadwise_dst_val_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + OutDstVectorSize, + InMemoryDataOperationEnum::Set, + 1, + true>( + out_grid_desc_m, + make_multi_index(block_global_1d_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize), + PassThroughOp{}); + + auto threadwise_dst_idx_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + OutDstVectorSize, + InMemoryDataOperationEnum::Set, + 1, + true>( + out_grid_desc_m, + make_multi_index(block_global_1d_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize), + PassThroughOp{}); + + threadwise_dst_val_store.Run(reduced_data_desc, + make_tuple(I0), + accu_value_buf, + out_grid_desc_m, + out_global_val_buf); + threadwise_dst_idx_store.Run(reduced_data_desc, + make_tuple(I0), + accu_index_buf, + out_grid_desc_m, + out_global_idx_buf); + } + }; +}; + +} // namespace ck +#endif diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_atomic_add.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_atomic_add.hpp deleted file mode 100644 index 4e325f3573..0000000000 --- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_atomic_add.hpp +++ /dev/null @@ -1,269 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE.
- * - *******************************************************************************/ -#ifndef CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_ATOMIC_ADD_HPP -#define CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_ATOMIC_ADD_HPP - -#include "reduction_common.hpp" -#include "reduction_operator.hpp" -#include "reduction_functions_accumulate.hpp" -#include "reduction_functions_blockwise.hpp" -#include "reduction_functions_threadwise.hpp" - -#include "threadwise_tensor_slice_transfer.hpp" -#include "element_wise_operation.hpp" - -namespace ck { - -template -__global__ void -kernel_reduce_multiblock_atocmi_add(const InGridDesc_M_K in_grid_desc_m_k, - const OutGridDesc_M out_grid_desc_m, - const InElementwiseOperation in_elementwise_op, - const AccElementwiseOperation acc_elementwise_op, - index_t block_group_size, - index_t num_k_block_tile_iteration, - AccDataType alpha, - const InDataType* const __restrict__ p_in_global, - OutDataType* const __restrict__ p_out_global) -{ - GridwiseReduction::Run(in_grid_desc_m_k, - out_grid_desc_m, - in_elementwise_op, - acc_elementwise_op, - block_group_size, - num_k_block_tile_iteration, - alpha, - p_in_global, - p_out_global); -}; - -template -struct GridwiseReduction_mk_to_m_multiblock_atomic_add -{ - static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) || - (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) && - (MThreadSliceSize % OutDstVectorSize == 0), - "Invalid thread slice sizes and/or vector sizes configuration, please check!"); - - static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0); - - using ThreadClusterLengths_M_K = Sequence; - - using ThreadBufferDimAccessOrder = - typename conditional, Sequence<0, 1>>::type; - - using ThreadClusterArrangeOrder = - typename conditional, Sequence<0, 1>>::type; - - static constexpr auto thread_cluster_desc = - make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{}); - - using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{}))); - using ThreadReduceDstDesc_M = - decltype(make_naive_tensor_descriptor_packed(make_tuple(Number{}))); - - using BlockwiseReduce = PartitionedBlockwiseReduction; - - using ThreadwiseReduce = ThreadwiseReduction; - - using PassThroughOp = tensor_operation::element_wise::PassThrough; - - static constexpr auto I0 = Number<0>{}; - static constexpr auto I1 = Number<1>{}; - - static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; - static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; - - using Accumulation = detail::AccumulateWithNanCheck; - - __device__ static void Run(const InGridDesc_M_K& in_grid_desc_m_k, - const OutGridDesc_M& out_grid_desc_m, - const InElementwiseOperation& in_elementwise_op, - const AccElementwiseOperation& acc_elementwise_op, - index_t block_group_size, - index_t num_k_block_tile_iteration, - AccDataType alpha, - const InDataType* const __restrict__ p_in_global, - OutDataType* const __restrict__ p_out_global) - { - const auto zeroVal = ReduceOperation::GetReductionZeroVal(); - - // LDS - __shared__ AccDataType p_reduce_work_buffer[BlockSize]; - - const auto in_global_buf = make_dynamic_buffer( - p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert(zeroVal)); - auto out_global_buf = make_dynamic_buffer( - p_out_global, out_grid_desc_m.GetElementSpaceSize()); - - auto reduce_work_buf = - make_dynamic_buffer(p_reduce_work_buffer, BlockSize); - - StaticBuffer - in_thread_buf; - - 
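// Note: the input dynamic buffer above is created with an invalid-element value of
// type_convert<InDataType>(zeroVal), so out-of-range reads return the reduction
// identity and padded vector lanes cannot perturb the accumulated result.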
StaticBuffer accu_value_buf; - - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; }); - - const index_t thread_local_id = get_thread_local_1d_id(); - const index_t block_global_id = get_block_1d_id(); - const index_t blkgroup_id = block_global_id / block_group_size; - const index_t block_local_id = block_global_id % block_group_size; - - const auto thread_cluster_idx = - thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); - - const auto thread_m_cluster_id = thread_cluster_idx[I0]; - const auto thread_k_cluster_id = thread_cluster_idx[I1]; - - const index_t reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration; - - using ThreadBufferLengths = Sequence; - constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{})); - - auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2( - in_grid_desc_m_k, - make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, - block_local_id * reduceSizePerBlock + - thread_k_cluster_id * KThreadSliceSize)); - - constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize); - - index_t reducedTiles = 0; - do - { - threadwise_src_load.Run(in_grid_desc_m_k, - in_global_buf, - thread_buffer_desc, - make_tuple(I0, I0), - in_thread_buf); - - static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { - // do element-wise pre-reduction operation - static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { - constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); - in_elementwise_op(in_thread_buf(Number{}), - in_thread_buf(Number{})); - }); - }); - - ThreadwiseReduce::Reduce(in_thread_buf, accu_value_buf); - - threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); - - reducedTiles++; - } while(reducedTiles < num_k_block_tile_iteration); - - constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{}; - - // Each block executes multiple parallel reductions on the LDS, and by atomic-adding its - // reduced output to the global location corresponding to each invariant dimension to get a - // consistent reduced result for that invariant dimension. due to the using of vector_load, - // each block/thread is involved into multiple invarirant dimensions. 
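The comment above is the whole contract of the atomic-add finalization: each of the `block_group_size` partial results lands in the same output slot, so the slot must start at the operation's identity (zero for Add) before the grid launches, and the final value is independent of the order in which blocks commit. A minimal sketch of that contract (plain C++; the per-block partials are hypothetical):

```cpp
// Order-independent finalization with atomic adds: all partials of one block
// group accumulate into a single output slot that starts at the identity.
#include <cstdio>

int main()
{
    const int block_group_size = 4;
    const double partial[4] = {1.0, 2.5, -0.5, 3.0}; // hypothetical per-block sums

    double out = 0.0; // output must be pre-initialized to the identity of Add
    for(int b = 0; b < block_group_size; ++b)
        out += partial[b]; // done with atomic_add on the GPU, in any order

    std::printf("final reduced value: %f\n", out); // 6.0 regardless of commit order
    return 0;
}
```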
- static_for<0, MThreadSliceSize, 1>{}( - [&](auto I) { BlockwiseReduce::Reduce(reduce_work_buf, accu_value_buf(I)); }); - - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - if(thread_k_cluster_id == 0) - { - acc_elementwise_op(accu_value_buf(I), accu_value_buf(I)); - - accu_value_buf(I) *= alpha; - } - }); - - if(thread_k_cluster_id == 0) - { - auto threadwise_dst_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - OutDstVectorSize, - InMemoryDataOperationEnum::AtomicAdd, - 1, - true>( - out_grid_desc_m, - make_multi_index(blkgroup_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize), - PassThroughOp{}); - - threadwise_dst_store.Run( - reduced_data_desc, make_tuple(I0), accu_value_buf, out_grid_desc_m, out_global_buf); - } - }; -}; - -} // namespace ck -#endif diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_partial_reduce.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_partial_reduce.hpp deleted file mode 100644 index d1be1f5275..0000000000 --- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_partial_reduce.hpp +++ /dev/null @@ -1,487 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -#ifndef CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_PARTIAL_REDUCE_HPP -#define CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_PARTIAL_REDUCE_HPP - -#include "reduction_common.hpp" -#include "reduction_operator.hpp" -#include "reduction_functions_accumulate.hpp" -#include "reduction_functions_blockwise.hpp" -#include "reduction_functions_threadwise.hpp" -#include "threadwise_tensor_slice_transfer.hpp" -#include "cluster_descriptor.hpp" -#include "element_wise_operation.hpp" - -namespace ck { - -template -__global__ void -kernel_partial_reduce_multiblock(const InGridDesc_M_K in_grid_desc_m_k, - const WorkspaceDesc_M_K workspace_desc_m_k, - const InElementwiseOperation in_elementwise_op, - const AccElementwiseOperation acc_elementwise_op, - index_t block_group_size, - index_t num_k_block_tile_iteration, - const InDataType* const __restrict__ p_src_global, - AccDataType* const __restrict__ p_ws_values_global, - IndexDataType* const __restrict__ p_ws_indices_global) - -{ - if constexpr(!NeedIndices) - { - GridwiseReduction::Run(in_grid_desc_m_k, - workspace_desc_m_k, - in_elementwise_op, - acc_elementwise_op, - block_group_size, - num_k_block_tile_iteration, - p_src_global, - p_ws_values_global, - p_ws_indices_global); - } - else - { - GridwiseReduction::RunWithIndex(in_grid_desc_m_k, - workspace_desc_m_k, - in_elementwise_op, - acc_elementwise_op, - block_group_size, - num_k_block_tile_iteration, - p_src_global, - p_ws_values_global, - p_ws_indices_global); - }; -}; - -template -struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce -{ - static_assert((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) || - (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0), - "Invalid thread slice sizes and/or vector sizes configuration, please check!"); - - static_assert(OutDstVectorSize == 1, "OutDstVectorSize must be 1 for MultiBlockPartialReduce!"); - - static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0); - - using ThreadClusterLengths_M_K = Sequence; - - using ThreadBufferDimAccessOrder = - typename conditional, Sequence<0, 1>>::type; - - using ThreadClusterArrangeOrder = - typename conditional, Sequence<0, 1>>::type; - - static constexpr auto thread_cluster_desc = - make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{}); - - using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{}))); - using ThreadReduceDstDesc_M = - decltype(make_naive_tensor_descriptor_packed(make_tuple(Number{}))); - - using PassThroughOp = tensor_operation::element_wise::PassThrough; - - static constexpr auto I0 = Number<0>{}; - static constexpr auto I1 = Number<1>{}; - - static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; - static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; - - __device__ static void Run(const InGridDesc_M_K& in_grid_desc_m_k, - const WorkspaceDesc_M_K& workspace_desc_m_k, - const InElementwiseOperation& in_elementwise_op, - const AccElementwiseOperation& acc_elementwise_op, - index_t block_group_size, - index_t num_k_block_tile_iteration, - const InDataType* const __restrict__ p_src_global, - AccDataType* const __restrict__ p_ws_values_global, - IndexDataType* const __restrict__ p_ws_indices_global) - { - using BlockwiseReduce = PartitionedBlockwiseReduction; - - using ThreadwiseReduce = ThreadwiseReduction; - - (void)p_ws_indices_global; - 
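// both parameters are intentionally unused here: the value-only pass writes no
// index workspace, and acc_elementwise_op is only applied when the workspace is
// finally reduced to the output by the follow-up kernel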
(void)acc_elementwise_op; - - const auto zeroVal = ReduceOperation::GetReductionZeroVal(); - - // LDS - __shared__ AccDataType p_reduce_work_buffer[BlockSize]; - - const auto in_global_buf = - make_dynamic_buffer(p_src_global, - in_grid_desc_m_k.GetElementSpaceSize(), - type_convert(zeroVal)); - auto workspace_global_buf = make_dynamic_buffer( - p_ws_values_global, workspace_desc_m_k.GetElementSpaceSize()); - - auto reduce_work_buf = - make_dynamic_buffer(p_reduce_work_buffer, BlockSize); - - StaticBuffer - in_thread_buf; - - StaticBuffer accu_value_buf; - - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; }); - - const index_t thread_local_id = get_thread_local_1d_id(); - const index_t block_global_id = get_block_1d_id(); - const index_t blkgroup_id = block_global_id / block_group_size; - const index_t block_local_id = block_global_id % block_group_size; - - const auto thread_cluster_idx = - thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); - - const auto thread_m_cluster_id = thread_cluster_idx[I0]; - const auto thread_k_cluster_id = thread_cluster_idx[I1]; - - const index_t reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration; - - using ThreadBufferLengths = Sequence; - constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{})); - - auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2( - in_grid_desc_m_k, - make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, - block_local_id * reduceSizePerBlock + - thread_k_cluster_id * KThreadSliceSize)); - - constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize); - - index_t reducedTiles = 0; - do - { - threadwise_src_load.Run(in_grid_desc_m_k, - in_global_buf, - thread_buffer_desc, - make_tuple(I0, I0), - in_thread_buf); - - static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { - // do element-wise pre-reduction operation - static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { - constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); - in_elementwise_op(in_thread_buf(Number{}), - in_thread_buf(Number{})); - }); - }); - - ThreadwiseReduce::Reduce(in_thread_buf, accu_value_buf); - - threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); - - reducedTiles++; - } while(reducedTiles < num_k_block_tile_iteration); - - // Each block executes multiple parallel reductions on the LDS, and due to the using of - // vector_load, each block/thread is involved into multiple invarirant dimensions. 
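For this removed partial-reduce path, the workspace written below is an M x `block_group_size` matrix: pass 1 stores one partial value per (m, block_local_id), and a follow-up kernel reduces along the second dimension. A host-side sketch of that layout with hypothetical values, using Max as the example operation:

```cpp
// Two-pass layout of the partial-reduce path: pass 1 fills an
// M x block_group_size workspace, pass 2 reduces along the second dimension.
#include <algorithm>
#include <cstdio>
#include <vector>

int main()
{
    const int M = 3, block_group_size = 4;
    std::vector<float> workspace(M * block_group_size);

    // stand-in for pass 1: block b stores its partial Max for row m
    for(int m = 0; m < M; ++m)
        for(int b = 0; b < block_group_size; ++b)
            workspace[m * block_group_size + b] = m + 0.1f * b; // hypothetical values

    // stand-in for pass 2: reduce each row of the workspace
    for(int m = 0; m < M; ++m)
    {
        float acc = workspace[m * block_group_size];
        for(int b = 1; b < block_group_size; ++b)
            acc = std::max(acc, workspace[m * block_group_size + b]);
        std::printf("out[%d] = %.1f\n", m, acc);
    }
    return 0;
}
```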
- static_for<0, MThreadSliceSize, 1>{}( - [&](auto I) { BlockwiseReduce::Reduce(reduce_work_buf, accu_value_buf(I)); }); - - constexpr auto reduced_data_desc = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number<1>{})); - - if(thread_k_cluster_id == 0) - { - auto threadwise_workspace_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0, 1>, - 1, - 1, - InMemoryDataOperationEnum::Set, - 1, - true>( - workspace_desc_m_k, - make_multi_index(blkgroup_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize, - block_local_id), - PassThroughOp{}); - - threadwise_workspace_store.Run(reduced_data_desc, - make_tuple(I0, I0), - accu_value_buf, - workspace_desc_m_k, - workspace_global_buf); - } - }; - - __device__ static void RunWithIndex(const InGridDesc_M_K& in_grid_desc_m_k, - const WorkspaceDesc_M_K& workspace_desc_m_k, - const InElementwiseOperation& in_elementwise_op, - const AccElementwiseOperation& acc_elementwise_op, - index_t block_group_size, - index_t num_k_block_tile_iteration, - const InDataType* const __restrict__ p_src_global, - AccDataType* const __restrict__ p_ws_values_global, - IndexDataType* const __restrict__ p_ws_indices_global) - { - using BlockwiseReduceWithIndex = - PartitionedBlockwiseReductionWithIndex; - - using AccumulationWithIndex = detail::AccumulateWithIndexAndNanCheck; - - (void)acc_elementwise_op; - - const auto zeroVal = ReduceOperation::GetReductionZeroVal(); - - // LDS - __shared__ AccDataType p_reduce_work_val_buffer[BlockSize]; - __shared__ index_t p_reduce_work_idx_buffer[BlockSize]; - - const auto in_global_buf = - make_dynamic_buffer(p_src_global, - in_grid_desc_m_k.GetElementSpaceSize(), - type_convert(zeroVal)); - auto workspace_global_val_buf = make_dynamic_buffer( - p_ws_values_global, workspace_desc_m_k.GetElementSpaceSize()); - auto workspace_global_idx_buf = make_dynamic_buffer( - p_ws_indices_global, workspace_desc_m_k.GetElementSpaceSize()); - - auto reduce_work_val_buf = - make_dynamic_buffer(p_reduce_work_val_buffer, BlockSize); - auto reduce_work_idx_buf = - make_dynamic_buffer(p_reduce_work_idx_buffer, BlockSize); - - StaticBuffer - in_thread_val_buf; - StaticBuffer - in_thread_idx_buf; - - StaticBuffer accu_value_buf; - StaticBuffer accu_index_buf; - - const index_t thread_local_id = get_thread_local_1d_id(); - const index_t block_global_id = get_block_1d_id(); - const index_t blkgroup_id = block_global_id / block_group_size; - const index_t block_local_id = block_global_id % block_group_size; - - const auto thread_cluster_idx = - thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); - - const auto thread_m_cluster_id = thread_cluster_idx[I0]; - const auto thread_k_cluster_id = thread_cluster_idx[I1]; - - const index_t reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration; - - using ThreadBufferLengths = Sequence; - constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{})); - - auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2( - in_grid_desc_m_k, - make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, - block_local_id * reduceSizePerBlock + - thread_k_cluster_id * KThreadSliceSize)); - - constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize); - - index_t indexOffset = block_local_id * reduceSizePerBlock; - - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - accu_value_buf(I) = zeroVal; - accu_index_buf(I) = 0; - }); - - index_t reducedTiles = 0; - do - { - // load 
the thread slice - threadwise_src_load.Run(in_grid_desc_m_k, - in_global_buf, - thread_buffer_desc, - make_tuple(I0, I0), - in_thread_val_buf); - - static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { - static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { - constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); - - // initialize the indices for the per-thread to-reduce values - in_thread_idx_buf(Number{}) = - indexOffset + thread_k_cluster_id * KThreadSliceSize + iK(); - - // do element-wise pre-reduction operation - in_elementwise_op(in_thread_val_buf(Number{}), - in_thread_val_buf(Number{})); - }); - - AccDataType tmpValue = zeroVal; - IndexDataType tmpIndex = 0; - - static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { - constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); - - AccumulationWithIndex::Calculate(tmpValue, - in_thread_val_buf[Number{}], - tmpIndex, - in_thread_idx_buf[Number{}]); - }); - - BlockwiseReduceWithIndex::Reduce( - reduce_work_val_buf, reduce_work_idx_buf, tmpValue, tmpIndex); - - AccumulationWithIndex::Calculate( - accu_value_buf(iM), tmpValue, accu_index_buf(iM), tmpIndex); - }); - - threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); - - indexOffset += K_BlockTileSize; - - reducedTiles++; - } while(reducedTiles < num_k_block_tile_iteration); - - constexpr auto reduced_data_desc = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number<1>{})); - - if(thread_k_cluster_id == 0) - { - auto threadwise_workspace_val_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0, 1>, - 1, - 1, - InMemoryDataOperationEnum::Set, - 1, - true>( - workspace_desc_m_k, - make_multi_index(blkgroup_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize, - block_local_id), - PassThroughOp{}); - - auto threadwise_workspace_idx_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0, 1>, - 1, - 1, - InMemoryDataOperationEnum::Set, - 1, - true>( - workspace_desc_m_k, - make_multi_index(blkgroup_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize, - block_local_id), - PassThroughOp{}); - - threadwise_workspace_val_store.Run(reduced_data_desc, - make_tuple(I0, I0), - accu_value_buf, - workspace_desc_m_k, - workspace_global_val_buf); - threadwise_workspace_idx_store.Run(reduced_data_desc, - make_tuple(I0, I0), - accu_index_buf, - workspace_desc_m_k, - workspace_global_idx_buf); - } - }; -}; - -} // namespace ck -#endif diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp index c047f7e375..ff01b88146 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp @@ -37,7 +37,8 @@ namespace ck { template (in_grid_desc_m_k, + out_grid_desc_m, + in_elementwise_op, + acc_elementwise_op, + alpha, + p_in_value_global, + p_in_index_global, + beta, + p_out_value_global, + p_out_index_global); }; }; @@ -91,11 +93,9 @@ template ; - (void)p_indices_global; - const auto zeroVal = ReduceOperation::GetReductionZeroVal(); - const auto in_global_buf = make_dynamic_buffer( - p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert(zeroVal)); + const auto in_global_val_buf = + make_dynamic_buffer(p_in_value_global, + in_grid_desc_m_k.GetElementSpaceSize(), + type_convert(zeroVal)); auto dst_global_buf = make_dynamic_buffer( - p_out_global, out_grid_desc_m.GetElementSpaceSize()); 
+ p_out_value_global, out_grid_desc_m.GetElementSpaceSize()); StaticBuffer in_thread_buf; @@ -160,28 +159,29 @@ struct GridwiseReduction_mk_to_m_threadwise index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id(); - auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2( - in_grid_desc_m_k, make_multi_index(thread_global_1d_id * MThreadSliceSize, 0)); + auto threadwise_src_val_load = + ThreadwiseTensorSliceTransfer_v2( + in_grid_desc_m_k, make_multi_index(thread_global_1d_id * MThreadSliceSize, 0)); constexpr auto in_thread_copy_step = make_multi_index(0, KThreadSliceSize); index_t reducedLength = 0; do { - threadwise_src_load.Run(in_grid_desc_m_k, - in_global_buf, - thread_buffer_desc, - make_tuple(I0, I0), - in_thread_buf); + threadwise_src_val_load.Run(in_grid_desc_m_k, + in_global_val_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_buf); static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { // do element-wise pre-reduction operation @@ -194,7 +194,7 @@ struct GridwiseReduction_mk_to_m_threadwise ThreadwiseReduce::Reduce(in_thread_buf, accu_value_buf); - threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); + threadwise_src_val_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); reducedLength += KThreadSliceSize; } while(reducedLength < toReduceLength); @@ -207,68 +207,65 @@ struct GridwiseReduction_mk_to_m_threadwise constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{}; - if constexpr(!BetaIsZero) + if(!float_equal_zero{}(beta)) { - if(!float_equal_zero{}(beta)) - { - auto threadwise_dst_load = - ThreadwiseTensorSliceTransfer_v2, - Sequence<0>, - 0, - 1, - 1, - true>( - out_grid_desc_m, make_multi_index(thread_global_1d_id * MThreadSliceSize)); + auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2, + Sequence<0>, + 0, + 1, + 1, + true>( + out_grid_desc_m, make_multi_index(thread_global_1d_id * MThreadSliceSize)); - StaticBuffer - priorDstValue_buf; + StaticBuffer + priorDstValue_buf; - threadwise_dst_load.Run(out_grid_desc_m, - dst_global_buf, - reduced_data_desc, - make_tuple(I0), - priorDstValue_buf); + threadwise_dst_load.Run(out_grid_desc_m, + dst_global_buf, + reduced_data_desc, + make_tuple(I0), + priorDstValue_buf); - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - accu_value_buf(I) += type_convert(priorDstValue_buf[I]) * beta; - }); - }; + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + accu_value_buf(I) += type_convert(priorDstValue_buf[I]) * beta; + }); }; - auto threadwise_dst_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - OutDstVectorSize, - InMemoryDataOperationEnum::Set, - 1, - false>( - out_grid_desc_m, - make_multi_index(thread_global_1d_id * MThreadSliceSize), - PassThroughOp{}); + auto threadwise_dst_store = ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + OutDstVectorSize, + OutMemoryDataOperation, + 1, + false>( + out_grid_desc_m, + make_multi_index(thread_global_1d_id * MThreadSliceSize), + PassThroughOp{}); threadwise_dst_store.Run( reduced_data_desc, make_tuple(I0), accu_value_buf, out_grid_desc_m, dst_global_buf); }; - __device__ static void RunWithIndices(const InGridDesc_M_K& in_grid_desc_m_k, - const OutGridDesc_M& out_grid_desc_m, - const InElementwiseOperation& in_elementwise_op, - const AccElementwiseOperation& acc_elementwise_op, - AccDataType alpha, - const InDataType* const __restrict__ p_in_global, - AccDataType beta, - OutDataType* const __restrict__ p_out_global, - IndexDataType* const __restrict__ p_indices_global) 
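The replacement signature below threads index buffers through both ends of the kernel. The reason `RunWithIndex` grows a `HaveIndexInput` path at all: when a reduction runs as the second call of a two-pass scheme, it must propagate the global k recorded by the first pass instead of re-synthesizing local offsets. A small argmax sketch of that two-pass index flow (plain C++, made-up data):

```cpp
// Why the second pass must load indices rather than synthesize them: pass 1
// records the winning global k per chunk, pass 2 carries those k values through.
#include <cstdio>

int main()
{
    const float in[8] = {1, 9, 4, 2, 7, 3, 8, 5}; // hypothetical input row
    float chunk_val[2];
    int chunk_idx[2];

    for(int c = 0; c < 2; ++c) // pass 1: per-chunk argmax over 4 elements
    {
        chunk_val[c] = in[4 * c];
        chunk_idx[c] = 4 * c;
        for(int k = 1; k < 4; ++k)
            if(in[4 * c + k] > chunk_val[c])
            {
                chunk_val[c] = in[4 * c + k];
                chunk_idx[c] = 4 * c + k;
            }
    }

    // pass 2 (HaveIndexInput == true): reduce the winners, keep their original k
    const int best = (chunk_val[1] > chunk_val[0]) ? 1 : 0;
    std::printf("max %.0f at k = %d\n", chunk_val[best], chunk_idx[best]); // max 9 at k = 1
    return 0;
}
```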
+ template + __device__ static void RunWithIndex(const InGridDesc_M_K& in_grid_desc_m_k, + const OutGridDesc_M& out_grid_desc_m, + const InElementwiseOperation& in_elementwise_op, + const AccElementwiseOperation& acc_elementwise_op, + AccDataType alpha, + const InDataType* const __restrict__ p_in_value_global, + const IndexDataType* const __restrict__ p_in_index_global, + AccDataType beta, + OutDataType* const __restrict__ p_out_value_global, + IndexDataType* const __restrict__ p_out_index_global) { using ThreadwiseReduceWithIndex = ThreadwiseReductionWithIndex( - p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert(zeroVal)); + const auto in_global_val_buf = + make_dynamic_buffer(p_in_value_global, + in_grid_desc_m_k.GetElementSpaceSize(), + type_convert(zeroVal)); + const auto in_global_idx_buf = make_dynamic_buffer( + p_in_index_global, in_grid_desc_m_k.GetElementSpaceSize()); + auto out_global_val_buf = make_dynamic_buffer( - p_out_global, out_grid_desc_m.GetElementSpaceSize()); + p_out_value_global, out_grid_desc_m.GetElementSpaceSize()); auto out_global_idx_buf = make_dynamic_buffer( - p_indices_global, out_grid_desc_m.GetElementSpaceSize()); + p_out_index_global, out_grid_desc_m.GetElementSpaceSize()); StaticBuffer in_thread_val_buf; @@ -313,50 +315,105 @@ struct GridwiseReduction_mk_to_m_threadwise index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id(); - auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2( - in_grid_desc_m_k, make_multi_index(thread_global_1d_id * MThreadSliceSize, 0)); + auto threadwise_src_val_load = + ThreadwiseTensorSliceTransfer_v2( + in_grid_desc_m_k, make_multi_index(thread_global_1d_id * MThreadSliceSize, 0)); constexpr auto in_thread_copy_step = make_multi_index(0, KThreadSliceSize); index_t indexStart = 0; index_t reducedLength = 0; - do + if constexpr(HaveIndexInput) { - threadwise_src_load.Run(in_grid_desc_m_k, - in_global_buf, - thread_buffer_desc, - make_tuple(I0, I0), - in_thread_val_buf); + auto threadwise_src_idx_load = + ThreadwiseTensorSliceTransfer_v2( + in_grid_desc_m_k, make_multi_index(thread_global_1d_id * MThreadSliceSize, 0)); - static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { - // do element-wise pre-reduction operation - static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { - constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); + do + { + threadwise_src_val_load.Run(in_grid_desc_m_k, + in_global_val_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_val_buf); - in_thread_idx_buf(Number{}) = indexStart + iK(); + threadwise_src_idx_load.Run(in_grid_desc_m_k, + in_global_idx_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_idx_buf); - in_elementwise_op(in_thread_val_buf(Number{}), - in_thread_val_buf(Number{})); + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + // do element-wise pre-reduction operation + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = + thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); + + in_elementwise_op(in_thread_val_buf(Number{}), + in_thread_val_buf(Number{})); + }); }); - }); - ThreadwiseReduceWithIndex::Reduce( - in_thread_val_buf, in_thread_idx_buf, accu_value_buf, accu_index_buf); + ThreadwiseReduceWithIndex::Reduce( + in_thread_val_buf, in_thread_idx_buf, accu_value_buf, accu_index_buf); - threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); + threadwise_src_val_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); + 
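// advance the index window by the same step as the value window above, keeping
// each loaded (value, index) pair aligned across K tile iterations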
threadwise_src_idx_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); - indexStart += KThreadSliceSize; - reducedLength += KThreadSliceSize; - } while(reducedLength < toReduceLength); + indexStart += KThreadSliceSize; + reducedLength += KThreadSliceSize; + } while(reducedLength < toReduceLength); + } + else + { + do + { + threadwise_src_val_load.Run(in_grid_desc_m_k, + in_global_val_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_val_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + // do element-wise pre-reduction operation + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = + thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); + + in_thread_idx_buf(Number{}) = indexStart + iK(); + + in_elementwise_op(in_thread_val_buf(Number{}), + in_thread_val_buf(Number{})); + }); + }); + + ThreadwiseReduceWithIndex::Reduce( + in_thread_val_buf, in_thread_idx_buf, accu_value_buf, accu_index_buf); + + threadwise_src_val_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); + + indexStart += KThreadSliceSize; + reducedLength += KThreadSliceSize; + } while(reducedLength < toReduceLength); + }; // for indiced operation, acc_elementwise_op shoud do nothing static_for<0, MThreadSliceSize, 1>{}([&](auto I) { @@ -367,36 +424,32 @@ struct GridwiseReduction_mk_to_m_threadwise constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{}; - if constexpr(!BetaIsZero) + if(!float_equal_zero{}(beta)) { - if(!float_equal_zero{}(beta)) - { - auto threadwise_dst_load = - ThreadwiseTensorSliceTransfer_v2, - Sequence<0>, - 0, - 1, - 1, - false>( - out_grid_desc_m, make_multi_index(thread_global_1d_id * MThreadSliceSize)); + auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2, + Sequence<0>, + 0, + 1, + 1, + false>( + out_grid_desc_m, make_multi_index(thread_global_1d_id * MThreadSliceSize)); - StaticBuffer - priorDstValue_buf; + StaticBuffer + priorDstValue_buf; - threadwise_dst_load.Run(out_grid_desc_m, - out_global_val_buf, - reduced_data_desc, - make_tuple(I0), - priorDstValue_buf); + threadwise_dst_load.Run(out_grid_desc_m, + out_global_val_buf, + reduced_data_desc, + make_tuple(I0), + priorDstValue_buf); - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - accu_value_buf(I) += type_convert(priorDstValue_buf[I]) * beta; - }); - }; + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + accu_value_buf(I) += type_convert(priorDstValue_buf[I]) * beta; + }); }; auto threadwise_dst_val_store = @@ -409,7 +462,7 @@ struct GridwiseReduction_mk_to_m_threadwise Sequence<0>, 0, OutDstVectorSize, - InMemoryDataOperationEnum::Set, + OutMemoryDataOperation, 1, false>( out_grid_desc_m, @@ -426,7 +479,7 @@ struct GridwiseReduction_mk_to_m_threadwise Sequence<0>, 0, OutDstVectorSize, - InMemoryDataOperationEnum::Set, + OutMemoryDataOperation, 1, false>( out_grid_desc_m, diff --git a/include/ck/utility/dynamic_buffer.hpp b/include/ck/utility/dynamic_buffer.hpp index 0ad78423fe..5e81c6a469 100644 --- a/include/ck/utility/dynamic_buffer.hpp +++ b/include/ck/utility/dynamic_buffer.hpp @@ -325,7 +325,7 @@ struct DynamicBuffer { if(is_valid_element) { - atomic_add(c_style_pointer_cast(&p_data_[i]), x); + atomic_add(c_style_pointer_cast(&p_data_[i]), x); } } } diff --git a/include/ck/utility/generic_memory_space_atomic.hpp b/include/ck/utility/generic_memory_space_atomic.hpp index 712d815f52..1a2dacb5c5 100644 --- a/include/ck/utility/generic_memory_space_atomic.hpp +++ b/include/ck/utility/generic_memory_space_atomic.hpp @@ -28,6 +28,12 @@ __device__ 
float atomic_add<float>(float* p_dst, const float& x) return atomicAdd(p_dst, x); } +template <> +__device__ double atomic_add<double>(double* p_dst, const double& x) +{ + return atomicAdd(p_dst, x); +} + template <> __device__ float2_t atomic_add<float2_t>(float2_t* p_dst, const float2_t& x) { @@ -45,6 +51,23 @@ __device__ float2_t atomic_add<float2_t>(float2_t* p_dst, const float2_t& x) return vy.template AsType<float2_t>()[I0]; } +template <> +__device__ double2_t atomic_add<double2_t>(double2_t* p_dst, const double2_t& x) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + const vector_type<double, 2> vx{x}; + vector_type<double, 2> vy{0}; + + vy.template AsType<double>()(I0) = + atomicAdd(c_style_pointer_cast<double*>(p_dst), vx.template AsType<double>()[I0]); + vy.template AsType<double>()(I1) = + atomicAdd(c_style_pointer_cast<double*>(p_dst) + 1, vx.template AsType<double>()[I1]); + + return vy.template AsType<double2_t>()[I0]; +} + // Caution: DO NOT REMOVE // intentionally have only declaration but no definition to cause compilation failure when trying to // instantiate this template. The purpose is to make the implementation of atomic_max explicit for diff --git a/include/ck/utility/reduction_operator.hpp b/include/ck/utility/reduction_operator.hpp index 5893f60547..e7a8db8c01 100644 --- a/include/ck/utility/reduction_operator.hpp +++ b/include/ck/utility/reduction_operator.hpp @@ -26,7 +26,8 @@ #ifndef CK_REDUCTION_OPERATOR_HPP #define CK_REDUCTION_OPERATOR_HPP -#include "common_header.hpp" +#include "config.hpp" +#include "data_type.hpp" namespace ck { @@ -41,12 +42,10 @@ namespace reduce { // when operated against them, and the concept is similar to zero vector in // vector space // (http://pages.cs.wisc.edu/~matthewb/pages/notes/pdf/linearalgebra/VectorSpaces.pdf). -// 2) indexable -- boolean value indicating whether indices of the operated elements could be -// recorded. Usually, Min/Max operator could -// need to record the indices of elements. For operator like Add/Mul, no need to -// record the indices. -// 3) operator() -- the first argument of the operator must be both an input & output, and the -// corresponding variable usually stores +// 2) IsCompatibleInMemoryDataOperation() -- returns true if the reduction task corresponding to this +// operator can use the InMemoryDataOperation to finalize, or else it returns false +// 3) operator() -- +// the first argument of the operator must be both an input & output, and the corresponding variable +// usually stores // the accumulated result of many operator() calls; the second argument is only an // input.
For indexable binary // operator, the second version of operator() has third argument (which is an @@ -62,6 +61,13 @@ struct Add __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast(0.0f); }; + __device__ static constexpr bool + IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation) + { + return operation == InMemoryDataOperationEnum::AtomicAdd || + operation == InMemoryDataOperationEnum::Set; + }; + __host__ __device__ inline constexpr void operator()(T& a, T b) const { a = a + b; } }; @@ -72,6 +78,12 @@ struct Mul __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast(1.0f); }; + __device__ static constexpr bool + IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation) + { + return operation == InMemoryDataOperationEnum::Set; + }; + __host__ __device__ inline constexpr void operator()(T& a, T b) const { a = a * b; } }; @@ -85,6 +97,13 @@ struct Max return NumericLimits::Lowest(); }; + __device__ static constexpr bool + IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation) + { + // ToChange: atomic_max to be added + return operation == InMemoryDataOperationEnum::Set; + }; + __host__ __device__ inline constexpr void operator()(T& a, T b) const { if(a < b) @@ -111,6 +130,13 @@ struct Min return NumericLimits::Max(); }; + __device__ static constexpr bool + IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation) + { + // ToChange: atomic_min to be added + return operation == InMemoryDataOperationEnum::Set; + }; + __host__ __device__ inline constexpr void operator()(T& a, T b) const { if(a > b) @@ -134,6 +160,13 @@ struct AMax __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast(0.0f); }; + __device__ static constexpr bool + IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation) + { + // ToChange: atomic_max to be added + return operation == InMemoryDataOperationEnum::Set; + }; + __host__ __device__ inline constexpr void operator()(T& a, T b) const { if(a < b) @@ -150,6 +183,17 @@ struct AMax } }; +template +T GetReductionZeroValueForInMemoryDataOperation(InMemoryDataOperationEnum operation) +{ + T result = ck::type_convert(0.0f); + + if(operation == InMemoryDataOperationEnum::AtomicMax) + result = ck::NumericLimits::Lowest(); + + return (result); +}; + }; // end of namespace reduce } // end of namespace ck diff --git a/library/include/ck/library/host_tensor/host_common_util.hpp b/library/include/ck/library/host_tensor/host_common_util.hpp new file mode 100644 index 0000000000..8fc1d36430 --- /dev/null +++ b/library/include/ck/library/host_tensor/host_common_util.hpp @@ -0,0 +1,102 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_HOST_COMMON_UTIL_HPP +#define GUARD_HOST_COMMON_UTIL_HPP + +#include +#include +#include +#include + +#include "config.hpp" + +namespace ck { + +namespace host_common { + +template +static inline void dumpBufferToFile(const char* fileName, T* data, size_t dataNumItems) +{ + std::ofstream outFile(fileName, std::ios::binary); + if(outFile) + { + outFile.write(reinterpret_cast(data), dataNumItems * sizeof(T)); + outFile.close(); + std::cout << "Write output to file " << fileName << std::endl; + } + else + { + std::cout << "Could not open file " << fileName << " for writing" << std::endl; + } +}; + +template +static inline T getSingleValueFromString(const std::string& valueStr) +{ + std::istringstream iss(valueStr); + + T val; + + iss >> val; + + return (val); +}; + +template +static inline std::vector getTypeValuesFromString(const char* cstr_values) +{ + std::string valuesStr(cstr_values); + + std::vector values; + std::size_t pos = 0; + std::size_t new_pos; + + new_pos = valuesStr.find(',', pos); + while(new_pos != std::string::npos) + { + const std::string sliceStr = valuesStr.substr(pos, new_pos - pos); + + T val = getSingleValueFromString(sliceStr); + + values.push_back(val); + + pos = new_pos + 1; + new_pos = valuesStr.find(',', pos); + }; + + std::string sliceStr = valuesStr.substr(pos); + T val = getSingleValueFromString(sliceStr); + + values.push_back(val); + + return (values); +} + +}; // namespace host_common + +}; // namespace ck + +#endif diff --git a/library/include/ck/library/host_tensor/host_reduce_util.hpp b/library/include/ck/library/host_tensor/host_reduce_util.hpp index 53e17bcb5c..095bb03426 100644 --- a/library/include/ck/library/host_tensor/host_reduce_util.hpp +++ b/library/include/ck/library/host_tensor/host_reduce_util.hpp @@ -28,9 +28,7 @@ #include #include -#include -#include -#include +#include #include "reduction_enums.hpp" #include "data_type.hpp" @@ -214,13 +212,13 @@ binop_with_nan_check(std::function opReduce, }; }; -template +template __host__ static inline void -binop_with_nan_check2(std::function opReduce, - AccDataType& accuVal, - AccDataType currVal, - int& accuIndex, - int currIndex) +binop_with_index_and_nan_check(std::function opReduce, + AccDataType& accuVal, + AccDataType currVal, + IndexDataType& accuIndex, + IndexDataType currIndex) { using ck::math::isnan; @@ -254,16 +252,6 @@ binop_with_nan_check2(std::function opRe }; // namespace host_reduce -static inline std::vector to_int_vector(const std::vector& inData) -{ - std::vector outData; - - for(auto elem : inData) - outData.push_back(static_cast(elem)); - - return (outData); -}; - }; // namespace ck #endif diff --git a/library/include/ck/library/host_tensor/host_reduction.hpp b/library/include/ck/library/host_tensor/host_reduction.hpp index b67f794505..1add62d1b5 100644 --- a/library/include/ck/library/host_tensor/host_reduction.hpp +++ b/library/include/ck/library/host_tensor/host_reduction.hpp @@ -34,6 +34,7 @@ 
#include "reduction_enums.hpp" #include "reduction_common.hpp" #include "host_reduce_util.hpp" +#include "host_common_util.hpp" #include "host_tensor.hpp" #include "data_type.hpp" @@ -200,7 +201,7 @@ struct ReductionHost using ck::float_equal_one; using ck::float_equal_zero; using ck::type_convert; - using ck::host_reduce::binop_with_nan_check2; + using ck::host_reduce::binop_with_index_and_nan_check; using ck::host_reduce::ReduceOpFn2; using ck::host_reduce::ReduceOpZeroVal; @@ -211,8 +212,7 @@ struct ReductionHost AccDataType accuVal = ReduceOpZeroVal(); IndexDataType accuIndex = 0; - for(IndexDataType i = 0; i < ck::type_convert(reduce_dim_indexes.size()); - i++) + for(std::size_t i = 0; i < reduce_dim_indexes.size(); i++) { auto offset_reduce = get_offset_from_index(reduceStrides, reduce_dim_indexes[i]); @@ -221,9 +221,9 @@ struct ReductionHost preUnaryOp(currVal); - auto currIndex = i; + auto currIndex = static_cast(i); - binop_with_nan_check2( + binop_with_index_and_nan_check( opReduce2, accuVal, currVal, accuIndex, currIndex); }; @@ -247,9 +247,7 @@ struct ReductionHost auto offset_invariant = get_offset_from_index(invariantStrides, invariant_index); - for(IndexDataType i = 0; - i < ck::type_convert(reduce_dim_indexes.size()); - i++) + for(std::size_t i = 0; i < reduce_dim_indexes.size(); i++) { auto offset_reduce = get_offset_from_index(reduceStrides, reduce_dim_indexes[i]); @@ -259,9 +257,9 @@ struct ReductionHost preUnaryOp(currVal); - auto currIndex = i; + auto currIndex = static_cast(i); - binop_with_nan_check2( + binop_with_index_and_nan_check( opReduce2, accuVal, currVal, accuIndex, currIndex); }; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp index fafbe120b9..6f0dbe75ff 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp @@ -9,26 +9,11 @@ #include "device_reduce_instance_blockwise_i8_i8_i8.hpp" #include "device_reduce_instance_blockwise_i8_i32_i8.hpp" #include "device_reduce_instance_blockwise_b16_f32_b16.hpp" -#include "device_reduce_instance_blockwise_second_call_f16_f16_f16.hpp" -#include "device_reduce_instance_blockwise_second_call_f32_f32_f16.hpp" -#include "device_reduce_instance_blockwise_second_call_f32_f32_f32.hpp" -#include "device_reduce_instance_blockwise_second_call_f64_f64_f32.hpp" -#include "device_reduce_instance_blockwise_second_call_f64_f64_f64.hpp" -#include "device_reduce_instance_blockwise_second_call_i8_i8_i8.hpp" -#include "device_reduce_instance_blockwise_second_call_i32_i32_i8.hpp" -#include "device_reduce_instance_blockwise_second_call_f32_f32_b16.hpp" #include "device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp" #include "device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp" #include "device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp" +#include "device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp" #include "device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp" -#include "device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.hpp" -#include "device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.hpp" -#include "device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.hpp" -#include "device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.hpp" -#include 
"device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.hpp" -#include "device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.hpp" -#include "device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.hpp" -#include "device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.hpp" #include "device_reduce_instance_threadwise_f16_f16_f16.hpp" #include "device_reduce_instance_threadwise_f16_f32_f16.hpp" #include "device_reduce_instance_threadwise_f32_f32_f32.hpp" diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp index e4b06cf96d..e31d4e769e 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp @@ -3,13 +3,27 @@ #include "reduction_operator_mapping.hpp" #include "device_reduce_instance_impl_common.hpp" -#include "device_reduce_blockwise.hpp" +#include "device_reduce_multiblock.hpp" namespace ck { namespace tensor_operation { namespace device { namespace device_reduce_instance { +using reduce_configuration_1_instances_blockwise = std::tuple< + // clang-format off + // BlockSize | MThreadClusterSize | KThreadClusterSize + ReductionConfiguration_1<256, 128, 2>, + ReductionConfiguration_1<256, 64, 4>, + ReductionConfiguration_1<256, 32, 8>, + ReductionConfiguration_1<256, 16, 16>, + ReductionConfiguration_1<256, 8, 32>, + ReductionConfiguration_1<256, 4, 64>, + ReductionConfiguration_1<256, 2, 128>, + ReductionConfiguration_1<256, 1, 256> + // clang-format on + >; + #ifdef QUICK_REDUCE_TEST using reduce_configuration_2_instances_blockwise = std::tuple< // clang-format off @@ -58,8 +72,8 @@ template + bool PropagateNan, + bool UseIndex> void add_device_reduce_instance_blockwise( std::vector>& device_op_instances) { @@ -73,92 +87,94 @@ void add_device_reduce_instance_blockwise( constexpr bool Indexable = (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX); - constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES); + constexpr bool OutputIndex = Indexable && UseIndex; - constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp
index e4b06cf96d..e31d4e769e 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp
@@ -3,13 +3,27 @@
 #include "reduction_operator_mapping.hpp"
 #include "device_reduce_instance_impl_common.hpp"
-#include "device_reduce_blockwise.hpp"
+#include "device_reduce_multiblock.hpp"
 
 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace device_reduce_instance {
 
+using reduce_configuration_1_instances_blockwise = std::tuple<
+    // clang-format off
+    // BlockSize | MThreadClusterSize | KThreadClusterSize
+    ReductionConfiguration_1<256, 128, 2>,
+    ReductionConfiguration_1<256, 64, 4>,
+    ReductionConfiguration_1<256, 32, 8>,
+    ReductionConfiguration_1<256, 16, 16>,
+    ReductionConfiguration_1<256, 8, 32>,
+    ReductionConfiguration_1<256, 4, 64>,
+    ReductionConfiguration_1<256, 2, 128>,
+    ReductionConfiguration_1<256, 1, 256>
+    // clang-format on
+    >;
+
 #ifdef QUICK_REDUCE_TEST
 using reduce_configuration_2_instances_blockwise = std::tuple<
     // clang-format off
@@ -58,8 +72,8 @@ template <typename InDataType,
           typename AccDataType,
           typename OutDataType,
           ReduceTensorOp ReduceOpId,
           index_t Rank,
           index_t NumReduceDim,
-          NanPropagation NanOpt,
-          ReduceTensorIndices IndicesOpt>
+          bool PropagateNan,
+          bool UseIndex>
 void add_device_reduce_instance_blockwise(
     std::vector<deviceReducePtrType<AccDataType, ReduceOpId>>& device_op_instances)
 {
@@ -73,92 +87,94 @@ void add_device_reduce_instance_blockwise(
     constexpr bool Indexable =
         (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
          ReduceOpId == ReduceTensorOp::AMAX);
-    constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES);
+    constexpr bool OutputIndex = Indexable && UseIndex;
 
-    constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
+    static_for<0, std::tuple_size<reduce_configuration_1_instances_blockwise>::value, 1>{}(
+        [&](auto i) {
+            using cfg1 = remove_cvref_t<decltype(
+                std::get<i.value>(reduce_configuration_1_instances_blockwise{}))>;
 
-    static_for<0, std::tuple_size<reduce_configuration_1_instances>::value, 1>{}([&](auto i) {
-        using cfg1 =
-            remove_cvref_t<decltype(std::get<i.value>(reduce_configuration_1_instances{}))>;
+            static_for<0, std::tuple_size<reduce_configuration_2_instances_blockwise>::value, 1>{}(
+                [&](auto j) {
+                    using cfg2 = remove_cvref_t<decltype(
+                        std::get<j.value>(reduce_configuration_2_instances_blockwise{}))>;
 
-        static_for<0, std::tuple_size<reduce_configuration_2_instances_blockwise>::value, 1>{}(
-            [&](auto j) {
-                using cfg2 = remove_cvref_t<decltype(
-                    std::get<j.value>(reduce_configuration_2_instances_blockwise{}))>;
+                    using ReduceOpInstance =
+                        DeviceReduceMultiBlock<InDataType,
+                                               AccDataType,
+                                               OutDataType,
+                                               Rank,
+                                               NumReduceDim,
+                                               ReduceOperation,
+                                               InElementwiseOperation,
+                                               AccElementwiseOperation,
+                                               InMemoryDataOperationEnum::Set,
+                                               PropagateNan,
+                                               OutputIndex,
+                                               false, // HaveIndexInputIfOutputIndex
+                                               cfg1::BlockSize_,
+                                               cfg1::MThreadClusterSize_,
+                                               cfg1::KThreadClusterSize_,
+                                               cfg2::MThreadSliceSize_,
+                                               cfg2::KThreadSliceSize_,
+                                               cfg2::InSrcVectorDim_,
+                                               cfg2::InSrcVectorSize_,
+                                               cfg2::OutDstVectorSize_>;
 
-                using ReduceOpInstance = DeviceReduceBlockWise<InDataType,
-                                                               AccDataType,
-                                                               OutDataType,
-                                                               Rank,
-                                                               NumReduceDim,
-                                                               ReduceOperation,
-                                                               InElementwiseOperation,
-                                                               AccElementwiseOperation,
-                                                               PropagateNan,
-                                                               NeedIndices,
-                                                               cfg1::BlockSize_,
-                                                               cfg1::MThreadClusterSize_,
-                                                               cfg1::KThreadClusterSize_,
-                                                               cfg2::MThreadSliceSize_,
-                                                               cfg2::KThreadSliceSize_,
-                                                               cfg2::InSrcVectorDim_,
-                                                               cfg2::InSrcVectorSize_,
-                                                               cfg2::OutDstVectorSize_>;
-
-                device_op_instances.push_back(
-                    std::make_unique<ReduceOpInstance>(ReduceOpInstance{}));
-            });
-    });
+                    device_op_instances.push_back(
+                        std::make_unique<ReduceOpInstance>(ReduceOpInstance{}));
+                });
+        });
 };
 
-#define ADD_BLOCKWISE_INST_BY_TYPE( \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
-    template void add_device_reduce_instance_blockwise< \
-        inT, compT, outT, ReduceOpId, Rank, NumReduceDim, NanOpt, IndicesOpt>( \
+#define ADD_BLOCKWISE_INST_BY_TYPE( \
+    inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \
+    template void add_device_reduce_instance_blockwise< \
+        inT, compT, outT, ReduceOpId, Rank, NumReduceDim, PropagateNan, UseIndex>( \
         std::vector<deviceReducePtrType<compT, ReduceOpId>> & device_op_instances)
 
-#define ADD_BLOCKWISE_INST_BY_ID( \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
-    ADD_BLOCKWISE_INST_BY_TYPE(inT, \
-                               compT, \
-                               outT, \
-                               static_cast<ReduceTensorOp>(ReduceOpId), \
-                               static_cast<NanPropagation>(NanOpt), \
-                               static_cast<ReduceTensorIndices>(IndicesOpt), \
-                               Rank, \
+#define ADD_BLOCKWISE_INST_BY_ID( \
+    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
+    ADD_BLOCKWISE_INST_BY_TYPE(inT, \
+                               compT, \
+                               outT, \
+                               static_cast<ReduceTensorOp>(ReduceOpId), \
+                               static_cast<bool>(NanOpt), \
+                               static_cast<bool>(IndicesOpt), \
+                               Rank, \
                                NumReduceDim)
 
 #define ADD_BLOCKWISE_INST_REF_BY_TYPE( \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
+    inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \
     extern template void add_device_reduce_instance_blockwise< \
-        inT, compT, outT, ReduceOpId, Rank, NumReduceDim, NanOpt, IndicesOpt>( \
+        inT, compT, outT, ReduceOpId, Rank, NumReduceDim, PropagateNan, UseIndex>( \
         std::vector<DeviceReducePtr< \
            typename reduce_unary_operator<compT, ReduceOpId, true, true>:: \
                InElementwiseOperation, \
            typename reduce_unary_operator<compT, ReduceOpId, true, true>:: \
                AccElementwiseOperation>> & \
            device_op_instances)
 
-#define ADD_BLOCKWISE_INST_REF_BY_ID( \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
-    ADD_BLOCKWISE_INST_REF_BY_TYPE(inT, \
-                                   compT, \
-                                   outT, \
-                                   static_cast<ReduceTensorOp>(ReduceOpId), \
-                                   static_cast<NanPropagation>(NanOpt), \
-                                   static_cast<ReduceTensorIndices>(IndicesOpt), \
-                                   Rank, \
+#define ADD_BLOCKWISE_INST_REF_BY_ID( \
+    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
+    ADD_BLOCKWISE_INST_REF_BY_TYPE(inT, \
+                                   compT, \
+                                   outT, \
+                                   static_cast<ReduceTensorOp>(ReduceOpId), \
+                                   static_cast<bool>(NanOpt), \
+                                   static_cast<bool>(IndicesOpt), \
+                                   Rank, \
                                    NumReduceDim)
 
 } // namespace device_reduce_instance
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp
index 0ae3289a0d..3cad45f2e5 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp
@@ -1,8 +1,7 @@
 #ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_B16_F32_B16_HPP
 #define DEVICE_REDUCE_INSTANCE_BLOCKWISE_B16_F32_B16_HPP
 
-#include "reduction_enums.hpp"
-#include "reduction_operator_mapping.hpp"
+#include "data_type.hpp"
 #include
"device_reduce_instance_blockwise.hpp" namespace ck { diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp index e7bdb15d92..441c1aec3f 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp @@ -1,8 +1,7 @@ #ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F16_F16_F16_HPP #define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F16_F16_F16_HPP -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" +#include "data_type.hpp" #include "device_reduce_instance_blockwise.hpp" namespace ck { diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp index dad0d86350..ca8532a458 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp @@ -1,8 +1,7 @@ #ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F16_F32_F16_HPP #define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F16_F32_F16_HPP -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" +#include "data_type.hpp" #include "device_reduce_instance_blockwise.hpp" namespace ck { diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp index 34ec15db2b..64f504c9da 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp @@ -1,8 +1,6 @@ #ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F32_F32_F32_HPP #define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F32_F32_F32_HPP -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" #include "device_reduce_instance_blockwise.hpp" namespace ck { diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp index b08f35ad09..9e84ee34fb 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp @@ -1,8 +1,6 @@ #ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F32_F64_F32_HPP #define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F32_F64_F32_HPP -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" #include "device_reduce_instance_blockwise.hpp" namespace ck { diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp index 65cdd45340..a37e3bdeb9 100644 --- 
a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp @@ -1,8 +1,6 @@ #ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F64_F64_F64_HPP #define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F64_F64_F64_HPP -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" #include "device_reduce_instance_blockwise.hpp" namespace ck { diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.hpp index f4a6677b3e..1d8695bbb0 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.hpp @@ -1,8 +1,6 @@ #ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I32_I8_HPP #define DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I32_I8_HPP -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" #include "device_reduce_instance_blockwise.hpp" namespace ck { diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp index 7f67138e6b..b5c19b7207 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp @@ -1,8 +1,6 @@ #ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I8_I8_HPP #define DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I8_I8_HPP -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" #include "device_reduce_instance_blockwise.hpp" namespace ck { diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call.hpp deleted file mode 100644 index 8e47bbfb6a..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call.hpp +++ /dev/null @@ -1,165 +0,0 @@ -#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_HPP -#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_HPP - -#include "reduction_operator_mapping.hpp" -#include "device_reduce_instance_impl_common.hpp" -#include "device_reduce_blockwise_second_call.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -#ifdef QUICK_REDUCE_TEST -using reduce_configuration_2_instances_blockwise_second_call = std::tuple< - // clang-format off - // InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize - ReductionConfiguration_2<1, 2, 1, 1, 2>, - ReductionConfiguration_2<1, 1, 1, 1, 3> - // clang-format on - >; -#else -using reduce_configuration_2_instances_blockwise_second_call = std::tuple< - // clang-format off - // InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize - ReductionConfiguration_2<1, 4, 1, 1, 8>, - ReductionConfiguration_2<1, 4, 1, 1, 4>, - ReductionConfiguration_2<1, 2, 1, 1, 2>, - - ReductionConfiguration_2<1, 1, 1, 1, 
3>, - ReductionConfiguration_2<1, 1, 1, 1, 5>, - ReductionConfiguration_2<1, 1, 1, 1, 7>, - ReductionConfiguration_2<1, 1, 1, 1, 11> - // clang-format on - >; -#endif - -template -using deviceReduceBlockWiseSecondCallPtrType = DeviceReducePtr< - typename reduce_unary_operator::InElementwiseOperation, - typename reduce_unary_operator::AccElementwiseOperation>; - -template -void add_device_reduce_instance_blockwise_second_call( - std::vector>& - device_op_instances) -{ - using ReduceOperation = typename reduce_binary_operator::opType; - using InElementwiseOperation = - typename reduce_unary_operator:: - InElementwiseOperation; - using AccElementwiseOperation = - typename reduce_unary_operator:: - AccElementwiseOperation; - - constexpr bool Indexable = - (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || - ReduceOpId == ReduceTensorOp::AMAX); - constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES); - - constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true; - - static_assert(std::is_same::value, - "InDataType and AccDataType should be the same to use " - "add_device_reduce_instance_blockwise_second_call!"); - - static_for<0, std::tuple_size::value, 1>{}([&](auto i) { - using cfg1 = - remove_cvref_t(reduce_configuration_1_instances{}))>; - - static_for<0, - std::tuple_size::value, - 1>{}([&](auto j) { - using cfg2 = remove_cvref_t(reduce_configuration_2_instances_blockwise_second_call{}))>; - - using ReduceOpInstance = DeviceReduceBlockWiseSecondCall; - - device_op_instances.push_back(std::make_unique(ReduceOpInstance{})); - }); - }); -}; - -#define ADD_BLOCKWISE_SECOND_CALL_INST_BY_TYPE( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - template void add_device_reduce_instance_blockwise_second_call( \ - std::vector> & \ - device_op_instances) - -#define ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - ADD_BLOCKWISE_SECOND_CALL_INST_BY_TYPE(inT, \ - compT, \ - outT, \ - static_cast(ReduceOpId), \ - static_cast(NanOpt), \ - static_cast(IndicesOpt), \ - Rank, \ - NumReduceDim) - -#define ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_TYPE( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - extern template void add_device_reduce_instance_blockwise_second_call( \ - std::vector< \ - DeviceReducePtr:: \ - InElementwiseOperation, \ - typename reduce_unary_operator:: \ - AccElementwiseOperation>> & \ - device_op_instances) - -#define ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_TYPE(inT, \ - compT, \ - outT, \ - static_cast(ReduceOpId), \ - static_cast(NanOpt), \ - static_cast(IndicesOpt), \ - Rank, \ - NumReduceDim) - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck - -#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.hpp deleted file mode 100644 index 4ce19c7d0c..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.hpp +++ /dev/null @@ -1,47 +0,0 @@ -#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F16_F16_F16_HPP 
-#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F16_F16_F16_HPP - -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" -#include "device_reduce_instance_blockwise_second_call.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck - -#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_b16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_b16.hpp deleted file mode 100644 index c85419befc..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_b16.hpp +++ /dev/null @@ -1,60 +0,0 @@ -#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F32_F32_B16_HPP -#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F32_F32_B16_HPP - -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" -#include "device_reduce_instance_blockwise_second_call.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | 
ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 7, 0, 0, 2, 1); - -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck - -#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f16.hpp deleted file mode 100644 index d42e7e020f..0000000000 --- 
a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f16.hpp +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F32_F32_F16_HPP -#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F32_F32_F16_HPP - -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" -#include "device_reduce_instance_blockwise_second_call.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 7, 0, 0, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck - -#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f32.hpp deleted file mode 100644 index fcf244d1d3..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f32.hpp +++ /dev/null @@ -1,59 +0,0 @@ -#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F32_F32_F32_HPP -#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F32_F32_F32_HPP - -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" -#include "device_reduce_instance_blockwise_second_call.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2 
-ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 7, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck - -#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f32.hpp deleted file mode 100644 index 72e806ee60..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f32.hpp +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F64_F64_F32_HPP -#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F64_F64_F32_HPP - -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" -#include "device_reduce_instance_blockwise_second_call.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 0, 0, 0, 4, 4); 
-ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 7, 0, 0, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck - -#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.hpp deleted file mode 100644 index 476c3a7d8f..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.hpp +++ /dev/null @@ -1,59 +0,0 @@ -#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F64_F64_F64_HPP -#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F64_F64_F64_HPP - -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" -#include "device_reduce_instance_blockwise_second_call.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 7, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 4); 
-ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck - -#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i32_i32_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i32_i32_i8.hpp deleted file mode 100644 index d46780483b..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i32_i32_i8.hpp +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_I32_I32_I8_HPP -#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_I32_I32_I8_HPP - -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" -#include "device_reduce_instance_blockwise_second_call.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck - -#endif diff --git 
a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i8_i8_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i8_i8_i8.hpp deleted file mode 100644 index 7b020fb439..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i8_i8_i8.hpp +++ /dev/null @@ -1,47 +0,0 @@ -#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_I8_I8_I8_HPP -#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_I8_I8_I8_HPP - -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" -#include "device_reduce_instance_blockwise_second_call.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck - -#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp index b25645034c..721d98a718 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp +++ 
b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp @@ -30,20 +30,6 @@ struct ReductionConfiguration_2 static constexpr int KThreadSliceSize_ = KThreadSliceSize; }; -using reduce_configuration_1_instances = std::tuple< - // clang-format off - // BlockSize | MThreadClusterSize | KThreadClusterSize - ReductionConfiguration_1<256, 128, 2>, - ReductionConfiguration_1<256, 64, 4>, - ReductionConfiguration_1<256, 32, 8>, - ReductionConfiguration_1<256, 16, 16>, - ReductionConfiguration_1<256, 8, 32>, - ReductionConfiguration_1<256, 4, 64>, - ReductionConfiguration_1<256, 2, 128>, - ReductionConfiguration_1<256, 1, 256> - // clang-format on - >; - #define QUICK_REDUCE_TEST 1 } // namespace device_reduce_instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp index bf10080b5e..605109d077 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp @@ -3,13 +3,27 @@ #include "reduction_operator_mapping.hpp" #include "device_reduce_instance_impl_common.hpp" -#include "device_reduce_multiblock_atomic_add.hpp" +#include "device_reduce_multiblock.hpp" namespace ck { namespace tensor_operation { namespace device { namespace device_reduce_instance { +using reduce_configuration_1_instances_multiblock_atomic_add = std::tuple< + // clang-format off + // BlockSize | MThreadClusterSize | KThreadClusterSize + ReductionConfiguration_1<256, 128, 2>, + ReductionConfiguration_1<256, 64, 4>, + ReductionConfiguration_1<256, 32, 8>, + ReductionConfiguration_1<256, 16, 16>, + ReductionConfiguration_1<256, 8, 32>, + ReductionConfiguration_1<256, 4, 64>, + ReductionConfiguration_1<256, 2, 128>, + ReductionConfiguration_1<256, 1, 256> + // clang-format on + >; + #ifdef QUICK_REDUCE_TEST using reduce_configuration_2_instances_multiblock_atomic_add = std::tuple< // clang-format off @@ -60,8 +74,8 @@ template + bool PropagateNan, + bool UseIndex> void add_device_reduce_instance_multiblock_atomic_add( std::vector>& device_op_instances) @@ -76,12 +90,10 @@ void add_device_reduce_instance_multiblock_atomic_add( constexpr bool Indexable = (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX); - constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES); + constexpr bool OutputIndex = Indexable && UseIndex; - constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? 
false : true; - - static_assert(IndicesOpt == ReduceTensorIndices::NO_INDICES, - "AtomicAdd can only be used with reduction operations without indices!"); + static_assert(UseIndex == false, + "AtomicAdd can only be used with reduction operations using no index!"); constexpr bool op_acceptable = (ReduceOpId == ReduceTensorOp::ADD || ReduceOpId == ReduceTensorOp::MUL || @@ -94,9 +106,11 @@ void add_device_reduce_instance_multiblock_atomic_add( return; else { - static_for<0, std::tuple_size::value, 1>{}([&](auto i) { - using cfg1 = - remove_cvref_t(reduce_configuration_1_instances{}))>; + static_for<0, + std::tuple_size::value, + 1>{}([&](auto i) { + using cfg1 = remove_cvref_t(reduce_configuration_1_instances_multiblock_atomic_add{}))>; static_for< 0, @@ -105,24 +119,27 @@ void add_device_reduce_instance_multiblock_atomic_add( using cfg2 = remove_cvref_t(reduce_configuration_2_instances_multiblock_atomic_add{}))>; - using ReduceOpInstance = DeviceReduceMultiBlockAtomicAdd; + using ReduceOpInstance = + DeviceReduceMultiBlock; device_op_instances.push_back( std::make_unique(ReduceOpInstance{})); @@ -132,54 +149,54 @@ void add_device_reduce_instance_multiblock_atomic_add( }; #define ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ + inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \ template void add_device_reduce_instance_multiblock_atomic_add( \ + PropagateNan, \ + UseIndex>( \ std::vector> & \ device_op_instances) -#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE(inT, \ - compT, \ - outT, \ - static_cast(ReduceOpId), \ - static_cast(NanOpt), \ - static_cast(IndicesOpt), \ - Rank, \ +#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID( \ + inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ + ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE(inT, \ + compT, \ + outT, \ + static_cast(ReduceOpId), \ + static_cast(NanOpt), \ + static_cast(IndicesOpt), \ + Rank, \ NumReduceDim) #define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ + inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \ extern template void add_device_reduce_instance_multiblock_atomic_add( \ + PropagateNan, \ + UseIndex>( \ std::vector::InElementwiseOperation, \ typename reduce_unary_operator:: \ AccElementwiseOperation>> & \ device_op_instances) -#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE(inT, \ - compT, \ - outT, \ - static_cast(ReduceOpId), \ - static_cast(NanOpt), \ - static_cast(IndicesOpt), \ - Rank, \ +#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID( \ + inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ + ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE(inT, \ + compT, \ + outT, \ + static_cast(ReduceOpId), \ + static_cast(NanOpt), \ + static_cast(IndicesOpt), \ + Rank, \ NumReduceDim) } // namespace device_reduce_instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp index 58f90bb94f..4e39cf49f6 100644 --- 
a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp
@@ -1,8 +1,7 @@
 #ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_B16_F32_F32_HPP
 #define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_B16_F32_F32_HPP
 
-#include "reduction_enums.hpp"
-#include "reduction_operator_mapping.hpp"
+#include "data_type.hpp"
 #include "device_reduce_instance_multiblock_atomic_add.hpp"
 
 namespace ck {
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp
index f4c766ca03..73424322ae 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp
@@ -1,8 +1,7 @@
 #ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F16_F32_F32_HPP
 #define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F16_F32_F32_HPP
 
-#include "reduction_enums.hpp"
-#include "reduction_operator_mapping.hpp"
+#include "data_type.hpp"
 #include "device_reduce_instance_multiblock_atomic_add.hpp"
 
 namespace ck {
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp
index c2f2564fc9..ecc9c4ea87 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp
@@ -1,8 +1,6 @@
 #ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F32_F32_HPP
 #define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F32_F32_HPP
 
-#include "reduction_enums.hpp"
-#include "reduction_operator_mapping.hpp"
 #include "device_reduce_instance_multiblock_atomic_add.hpp"
 
 namespace ck {
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp
index 830dcf9407..41a60d5b70 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp
@@ -1,8 +1,6 @@
 #ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F64_F32_HPP
 #define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F64_F32_HPP
 
-#include "reduction_enums.hpp"
-#include "reduction_operator_mapping.hpp"
 #include "device_reduce_instance_multiblock_atomic_add.hpp"
 
 namespace ck {
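The new f64_f64_f64 instance header below registers fp64 multiblock atomic-add reductions; only ADD and AVG are instantiated there, since the atomic-add finalization is restricted to index-free accumulation operations (see the static_assert and op_acceptable check in the factory above). As a host-side mental model of what an atomic add on double must guarantee, here is a compare-and-swap loop equivalent in plain C++ (illustrative only; the actual device path lives in generic_memory_space_atomic.hpp):

```cpp
#include <atomic>

// CAS-loop equivalent of an atomic add on double: read the current value,
// try to install old + value, and retry if another thread won the race.
inline void atomic_add_double(std::atomic<double>& target, double value)
{
    double expected = target.load(std::memory_order_relaxed);
    while(!target.compare_exchange_weak(
        expected, expected + value, std::memory_order_relaxed))
    {
        // compare_exchange_weak refreshes 'expected' on failure; just retry.
    }
}

int main()
{
    std::atomic<double> acc{0.0};
    atomic_add_double(acc, 2.5);
    atomic_add_double(acc, 0.5);
    return acc.load() == 3.0 ? 0 : 1;
}
```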
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp
new file mode 100644
index 0000000000..bdcca274d7
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp
@@ -0,0 +1,29 @@
+#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F64_F64_F64_HPP
+#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F64_F64_F64_HPP
+
+#include "device_reduce_instance_multiblock_atomic_add.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 4);
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 1);
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 0, 0, 0, 2, 1);
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 4);
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 1);
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
+
+#endif
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce.hpp
deleted file mode 100644
index 5c323ec175..0000000000
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce.hpp
+++ /dev/null
@@ -1,174 +0,0 @@
-#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_HPP
-#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_HPP
-
-#include "reduction_operator_mapping.hpp"
-#include "device_reduce_instance_impl_common.hpp"
-#include "device_reduce_multiblock_partial_reduce.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace device_reduce_instance {
-
-#ifdef QUICK_REDUCE_TEST
-using reduce_configuration_2_instances_multiblock_partial_reduce = std::tuple<
-    // clang-format off
-    // InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize
-    ReductionConfiguration_2<0, 1, 1, 2, 1>,
-    ReductionConfiguration_2<1, 2, 1, 1, 2>,
-    ReductionConfiguration_2<0, 1, 1, 3, 1>,
-    ReductionConfiguration_2<1, 1, 1, 1, 3>
-    // clang-format on
-    >;
-#else
-using reduce_configuration_2_instances_multiblock_partial_reduce = std::tuple<
-    // clang-format off
-    // InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize
-    ReductionConfiguration_2<0, 4, 1, 8, 1>,
-    ReductionConfiguration_2<0, 4, 1, 4, 1>,
-    ReductionConfiguration_2<0, 2, 1, 2, 1>,
-
-    ReductionConfiguration_2<1, 4, 1, 1, 8>,
-    ReductionConfiguration_2<1, 4, 1, 1, 4>,
-    ReductionConfiguration_2<1, 2, 1, 1, 2>,
-
-    // special instances
-    ReductionConfiguration_2<0, 1, 1, 3, 1>,
-    ReductionConfiguration_2<0, 1, 1, 5, 1>,
-    ReductionConfiguration_2<0, 1, 1, 7, 1>,
-    ReductionConfiguration_2<0, 1, 1, 11, 1>,
-
-    ReductionConfiguration_2<0, 1, 1, 1, 3>,
-    ReductionConfiguration_2<0, 1, 1, 1, 5>,
-    ReductionConfiguration_2<0, 1, 1, 1, 7>,
-    ReductionConfiguration_2<0, 1, 1, 1, 11>
-    // clang-format on
-    >;
-#endif
-
-template
-using
deviceReduceMultiBlockPartialReducePtrType = DeviceReducePtr< - typename reduce_unary_operator::InElementwiseOperation, - typename reduce_unary_operator::AccElementwiseOperation>; - -template -void add_device_reduce_instance_multiblock_partial_reduce( - std::vector>& - device_op_instances) -{ - using ReduceOperation = typename reduce_binary_operator::opType; - using InElementwiseOperation = - typename reduce_unary_operator:: - InElementwiseOperation; - using AccElementwiseOperation = - typename reduce_unary_operator:: - AccElementwiseOperation; - - constexpr bool Indexable = - (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || - ReduceOpId == ReduceTensorOp::AMAX); - constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES); - - constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true; - - static_for<0, std::tuple_size::value, 1>{}([&](auto i) { - using cfg1 = - remove_cvref_t(reduce_configuration_1_instances{}))>; - - static_for< - 0, - std::tuple_size::value, - 1>{}([&](auto j) { - using cfg2 = remove_cvref_t(reduce_configuration_2_instances_multiblock_partial_reduce{}))>; - - using ReduceOpInstance = DeviceReduceMultiBlockPartialReduce; - - device_op_instances.push_back(std::make_unique(ReduceOpInstance{})); - }); - }); -}; - -#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_TYPE( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - template void add_device_reduce_instance_multiblock_partial_reduce( \ - std::vector> & \ - device_op_instances) - -#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_TYPE(inT, \ - compT, \ - outT, \ - static_cast(ReduceOpId), \ - static_cast(NanOpt), \ - static_cast(IndicesOpt), \ - Rank, \ - NumReduceDim) - -#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_TYPE( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - extern template void add_device_reduce_instance_multiblock_partial_reduce( \ - std::vector< \ - DeviceReducePtr:: \ - InElementwiseOperation, \ - typename reduce_unary_operator:: \ - AccElementwiseOperation>> & \ - device_op_instances) - -#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_TYPE(inT, \ - compT, \ - outT, \ - static_cast(ReduceOpId), \ - static_cast(NanOpt), \ - static_cast(IndicesOpt), \ - Rank, \ - NumReduceDim) - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck - -#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.hpp deleted file mode 100644 index d25645ad1e..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.hpp +++ /dev/null @@ -1,60 +0,0 @@ -#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_B16_F32_B16_HPP -#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_B16_F32_B16_HPP - -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" -#include "device_reduce_instance_multiblock_partial_reduce.hpp" - -namespace ck { -namespace tensor_operation { -namespace 
device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2 -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 2, 1); - -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck - 
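// Editorial aid (not part of the patch): every *_INST_REF_BY_ID / *_INST_BY_ID list in
// this patch uses the same positional arguments -- (InDataType, AccDataType, OutDataType,
// ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) -- and the _BY_ID macros simply
// static_cast the three integer ids to ReduceTensorOp, NanPropagation and
// ReduceTensorIndices. From the "// for ..." comments, the op ids in use are
// 0=ADD, 2=MIN, 3=MAX, 4=AMAX, 5=AVG, 7=NORM2; NanOpt 0 is NOT_PROPAGATE_NAN, and
// IndicesOpt 1 requests index output (meaningful only for MIN/MAX/AMAX).
// A minimal decoder for reading these tables; reduce_op_name is a hypothetical helper,
// not a library function:
inline const char* reduce_op_name(int id)
{
    switch(id)
    {
    case 0: return "ADD";
    case 2: return "MIN";
    case 3: return "MAX";
    case 4: return "AMAX";
    case 5: return "AVG";
    case 7: return "NORM2";
    default: return "UNKNOWN";
    }
}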
-#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.hpp deleted file mode 100644 index 05549fc702..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.hpp +++ /dev/null @@ -1,47 +0,0 @@ -#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F16_F16_F16_HPP -#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F16_F16_F16_HPP - -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" -#include "device_reduce_instance_multiblock_partial_reduce.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck - -#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.hpp 
b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.hpp deleted file mode 100644 index 3e4aaef51b..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.hpp +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F16_F32_F16_HPP -#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F16_F32_F16_HPP - -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" -#include "device_reduce_instance_multiblock_partial_reduce.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 3); // for ADD -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 3); // for AVG -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 3); // for NORM2 -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck - -#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.hpp deleted file mode 100644 index 2a1e4e7bf0..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.hpp +++ /dev/null @@ -1,52 +0,0 @@ -#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F32_F32_F32_HPP -#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F32_F32_F32_HPP - -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" -#include "device_reduce_instance_multiblock_partial_reduce.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX 
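// Editorial sketch (not part of the patch): add_device_reduce_instance_multiblock_partial_reduce,
// deleted above, is a small compile-time factory: static_for walks
// reduce_configuration_1_instances and reduce_configuration_2_instances_multiblock_partial_reduce,
// and each (cfg1, cfg2) pair yields one DeviceReduceMultiBlockPartialReduce instance pushed into
// the caller's vector of type-erased DeviceReducePtr. The runtime analogue of that pattern, with
// hypothetical Base/Config/Instance types standing in for the library ones:
#include <memory>
#include <vector>

struct Base { virtual ~Base() = default; };
struct Config { int block_size; int m_slice; int k_slice; };
struct Instance : Base
{
    Config cfg1, cfg2;
    Instance(Config a, Config b) : cfg1(a), cfg2(b) {}
};

void add_instances(std::vector<std::unique_ptr<Base>>& out)
{
    const Config table1[] = {{256, 1, 1}, {256, 2, 2}};        // stands in for reduce_configuration_1_instances
    const Config table2[] = {{0, 1, 1}, {0, 2, 2}, {0, 4, 4}}; // stands in for reduce_configuration_2_instances_*
    for(const auto& c1 : table1)
        for(const auto& c2 : table2)
            out.push_back(std::make_unique<Instance>(c1, c2)); // one device op per configuration pair
}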
-ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1); - -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2 -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck - -#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.hpp deleted file mode 100644 index f95e3001ee..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.hpp +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F32_F64_F32_HPP -#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F32_F64_F32_HPP - -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" -#include "device_reduce_instance_multiblock_partial_reduce.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 3); // for NORM2 -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // 
namespace device -} // namespace tensor_operation - -} // namespace ck - -#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.hpp deleted file mode 100644 index fac65128b6..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.hpp +++ /dev/null @@ -1,62 +0,0 @@ -#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F64_F64_F64_HPP -#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F64_F64_F64_HPP - -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" -#include "device_reduce_instance_multiblock_partial_reduce.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1); - -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2 -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, 
double, double, 7, 0, 0, 2, 1); - -// Will be moved to use MultiBlockAtomicAdd -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck - -#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.hpp deleted file mode 100644 index 895c144c66..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.hpp +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_I8_I32_I8_HPP -#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_I8_I32_I8_HPP - -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" -#include "device_reduce_instance_multiblock_partial_reduce.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck - -#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.hpp deleted file mode 100644 index d6bee57fcd..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.hpp +++ /dev/null @@ -1,47 +0,0 @@ -#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_I8_I8_I8_HPP -#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_I8_I8_I8_HPP - -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" -#include 
"device_reduce_instance_multiblock_partial_reduce.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck - -#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp index f3a0781c2b..a2b4ae22be 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp @@ -58,8 +58,8 @@ template + bool PropagateNan, + bool UseIndex> void add_device_reduce_instance_threadwise( std::vector>& device_op_instances) { @@ -73,9 +73,7 @@ void add_device_reduce_instance_threadwise( constexpr bool Indexable = (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX); - constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES); - - constexpr bool PropagateNan = (NanOpt == 
NanPropagation::NOT_PROPAGATE_NAN) ? false : true; + constexpr bool OutputIndex = Indexable && UseIndex; using cfg1 = ReductionConfiguration_1<256, 256, 1>; @@ -93,10 +91,9 @@ void add_device_reduce_instance_threadwise( InElementwiseOperation, AccElementwiseOperation, PropagateNan, - NeedIndices, + OutputIndex, + false, // HaveIndexInputIfOutputIndex cfg1::BlockSize_, - cfg1::MThreadClusterSize_, - cfg1::KThreadClusterSize_, cfg2::MThreadSliceSize_, cfg2::KThreadSliceSize_, cfg2::InSrcVectorDim_, @@ -107,54 +104,54 @@ void add_device_reduce_instance_threadwise( }); }; -#define ADD_THREADWISE_INST_BY_TYPE( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - template void add_device_reduce_instance_threadwise( \ +#define ADD_THREADWISE_INST_BY_TYPE( \ + inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \ + template void add_device_reduce_instance_threadwise( \ std::vector> & device_op_instances) -#define ADD_THREADWISE_INST_BY_ID( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - ADD_THREADWISE_INST_BY_TYPE(inT, \ - compT, \ - outT, \ - static_cast(ReduceOpId), \ - static_cast(NanOpt), \ - static_cast(IndicesOpt), \ - Rank, \ +#define ADD_THREADWISE_INST_BY_ID( \ + inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ + ADD_THREADWISE_INST_BY_TYPE(inT, \ + compT, \ + outT, \ + static_cast(ReduceOpId), \ + static_cast(NanOpt), \ + static_cast(IndicesOpt), \ + Rank, \ NumReduceDim) #define ADD_THREADWISE_INST_REF_BY_TYPE( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ + inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \ extern template void add_device_reduce_instance_threadwise( \ + PropagateNan, \ + UseIndex>( \ std::vector::InElementwiseOperation, \ typename reduce_unary_operator:: \ AccElementwiseOperation>> & \ device_op_instances) -#define ADD_THREADWISE_INST_REF_BY_ID( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - ADD_THREADWISE_INST_REF_BY_TYPE(inT, \ - compT, \ - outT, \ - static_cast(ReduceOpId), \ - static_cast(NanOpt), \ - static_cast(IndicesOpt), \ - Rank, \ +#define ADD_THREADWISE_INST_REF_BY_ID( \ + inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ + ADD_THREADWISE_INST_REF_BY_TYPE(inT, \ + compT, \ + outT, \ + static_cast(ReduceOpId), \ + static_cast(NanOpt), \ + static_cast(IndicesOpt), \ + Rank, \ NumReduceDim) } // namespace device_reduce_instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp index f11d9118c9..0291f33214 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp @@ -1,8 +1,7 @@ #ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_B16_F32_B16_HPP #define DEVICE_REDUCE_INSTANCE_THREADWISE_B16_F32_B16_HPP -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" +#include "data_type.hpp" #include "device_reduce_instance_threadwise.hpp" namespace ck { diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp 
b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp index fe220335c5..7ab1bebc5f 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp @@ -1,8 +1,7 @@ #ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F16_F16_HPP #define DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F16_F16_HPP -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" +#include "data_type.hpp" #include "device_reduce_instance_threadwise.hpp" namespace ck { diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp index 970559cfac..39c3d10660 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp @@ -1,8 +1,7 @@ #ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F32_F16_HPP #define DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F32_F16_HPP -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" +#include "data_type.hpp" #include "device_reduce_instance_threadwise.hpp" namespace ck { diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp index 66c33a72a4..3c47bfd189 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp @@ -1,8 +1,6 @@ #ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F32_F32_HPP #define DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F32_F32_HPP -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" #include "device_reduce_instance_threadwise.hpp" namespace ck { diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp index 196f142dbf..9df9f6f1fa 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp @@ -1,8 +1,6 @@ #ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F64_F32_HPP #define DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F64_F32_HPP -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" #include "device_reduce_instance_threadwise.hpp" namespace ck { diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp index 4f3e1448d0..00ab218f20 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp +++ 
b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp @@ -1,8 +1,6 @@ #ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F64_F64_F64_HPP #define DEVICE_REDUCE_INSTANCE_THREADWISE_F64_F64_F64_HPP -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" #include "device_reduce_instance_threadwise.hpp" namespace ck { diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp index 8f19a5d0a2..de7445b043 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp @@ -1,8 +1,6 @@ #ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I32_I8_HPP #define DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I32_I8_HPP -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" #include "device_reduce_instance_threadwise.hpp" namespace ck { diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp index 83bd48cd3f..1ea1ee745e 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp @@ -1,8 +1,6 @@ #ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I8_I8_HPP #define DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I8_I8_HPP -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" #include "device_reduce_instance_threadwise.hpp" namespace ck { diff --git a/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt index 81987ac0d4..d566796c13 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt @@ -16,26 +16,11 @@ set(DEVICE_REDUCE_INSTANCE_SOURCE device_reduce_instance_threadwise_i8_i32_i8.cpp; device_reduce_instance_threadwise_i8_i8_i8.cpp; device_reduce_instance_threadwise_b16_f32_b16.cpp; - device_reduce_instance_blockwise_second_call_f16_f16_f16.cpp; - device_reduce_instance_blockwise_second_call_f32_f32_f16.cpp; - device_reduce_instance_blockwise_second_call_f32_f32_f32.cpp; - device_reduce_instance_blockwise_second_call_f64_f64_f32.cpp; - device_reduce_instance_blockwise_second_call_f64_f64_f64.cpp; - device_reduce_instance_blockwise_second_call_i32_i32_i8.cpp; - device_reduce_instance_blockwise_second_call_i8_i8_i8.cpp; - device_reduce_instance_blockwise_second_call_f32_f32_b16.cpp; device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp; device_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp; device_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp; + device_reduce_instance_multiblock_atomic_add_f64_f64_f64.cpp; device_reduce_instance_multiblock_atomic_add_b16_f32_f32.cpp; - device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.cpp; - device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.cpp; - device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.cpp; - device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.cpp; - 
device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.cpp; - device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.cpp; - device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.cpp; - device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.cpp; ) add_library(device_reduce_instance OBJECT ${DEVICE_REDUCE_INSTANCE_SOURCE}) diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.cpp deleted file mode 100644 index 82a9c11413..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.cpp +++ /dev/null @@ -1,40 +0,0 @@ -#include "device_reduce_instance_blockwise_second_call.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_b16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_b16.cpp deleted file mode 100644 index 6b8139c32c..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_b16.cpp 
+++ /dev/null @@ -1,53 +0,0 @@ -#include "device_reduce_instance_blockwise_second_call.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 7, 0, 0, 2, 1); - -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f16.cpp 
b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f16.cpp deleted file mode 100644 index 267b9d4d9d..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f16.cpp +++ /dev/null @@ -1,28 +0,0 @@ -#include "device_reduce_instance_blockwise_second_call.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 7, 0, 0, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f32.cpp deleted file mode 100644 index 0036a89542..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f32.cpp +++ /dev/null @@ -1,52 +0,0 @@ -#include "device_reduce_instance_blockwise_second_call.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 7, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 0, 4, 4); 
-ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f32.cpp deleted file mode 100644 index 0512fa4158..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f32.cpp +++ /dev/null @@ -1,28 +0,0 @@ -#include "device_reduce_instance_blockwise_second_call.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 7, 0, 0, 2, 1); -// clang-format on - 
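// Editorial sketch (not part of the patch): the blockwise_second_call instances removed in
// this patch take the accumulator type of a preceding pass as their input type (float/float
// -> bhalf_t, double/double -> float, int32/int32 -> int8), i.e. they existed to finish, in a
// second kernel launch, a reduction whose first launch had written per-block partial results
// to a workspace. With the multiblock partial-reduce path deleted alongside them in
// CMakeLists.txt, the two-launch driver pattern below loses its last user. All names here are
// hypothetical stand-ins, not library types:
#include <cstddef>

struct PartialReduceOp
{
    // first launch: one partial accumulator per block, written to the workspace
    void Run(const float* in, float* workspace, std::size_t n) { (void)in; (void)workspace; (void)n; }
};

struct SecondCallOp
{
    // second launch: reduce the per-block partials down to the final output
    void Run(const float* workspace, float* out, std::size_t num_blocks) { (void)workspace; (void)out; (void)num_blocks; }
};

void run_two_stage(PartialReduceOp& first, SecondCallOp& second,
                   const float* in, float* workspace, float* out,
                   std::size_t n, std::size_t num_blocks)
{
    first.Run(in, workspace, n);
    second.Run(workspace, out, num_blocks);
}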
-} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.cpp deleted file mode 100644 index afe7f0752e..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.cpp +++ /dev/null @@ -1,52 +0,0 @@ -#include "device_reduce_instance_blockwise_second_call.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 7, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX 
-ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i32_i32_i8.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i32_i32_i8.cpp deleted file mode 100644 index 9cb3b8684f..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i32_i32_i8.cpp +++ /dev/null @@ -1,24 +0,0 @@ -#include "device_reduce_instance_blockwise_second_call.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i8_i8_i8.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i8_i8_i8.cpp deleted file mode 100644 index 8783a75486..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i8_i8_i8.cpp +++ /dev/null @@ -1,40 +0,0 @@ -#include "device_reduce_instance_blockwise_second_call.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1); 
-ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.cpp new file mode 100644 index 0000000000..497f2695be --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.cpp @@ -0,0 +1,24 @@ +#include "device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim +ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD +ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 0, 0, 0, 4, 4); +ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 0, 0, 0, 4, 1); +ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1); +ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG +ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 5, 0, 0, 4, 4); +ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 5, 0, 0, 4, 1); +ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1); +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.cpp deleted file mode 100644 index d740fcfa8f..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.cpp +++ /dev/null @@ -1,53 +0,0 @@ -#include "device_reduce_instance_multiblock_partial_reduce.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD 
-ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2 -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 2, 1); - -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.cpp deleted file mode 100644 index f57ed5ad86..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.cpp 
+++ /dev/null @@ -1,40 +0,0 @@ -#include "device_reduce_instance_multiblock_partial_reduce.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.cpp deleted file mode 100644 index 724b364104..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.cpp +++ /dev/null @@ -1,28 +0,0 @@ -#include "device_reduce_instance_multiblock_partial_reduce.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 3); // for ADD -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, 
float, half_t, 0, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 3); // for AVG -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 3); // for NORM2 -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.cpp deleted file mode 100644 index 15028a0b4c..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.cpp +++ /dev/null @@ -1,45 +0,0 @@ -#include "device_reduce_instance_multiblock_partial_reduce.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 1, 2, 1); - 
-ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2 -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 7, 0, 0, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.cpp deleted file mode 100644 index ec0ba3cf8e..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.cpp +++ /dev/null @@ -1,20 +0,0 @@ -#include "device_reduce_instance_multiblock_partial_reduce.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 3); // for NORM2 -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, double, float, 7, 0, 0, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.cpp deleted file mode 100644 index 9ff2dcd93b..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.cpp +++ /dev/null @@ -1,55 +0,0 @@ -#include "device_reduce_instance_multiblock_partial_reduce.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN 
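Note that the MIN/MAX/AMAX groups in these lists appear twice, distinguished only by the IndicesOpt flag (0 vs 1): the flagged variants also return the index of the winning element. As a reference for what the index-producing variant computes per output element, here is a small host-side sketch; it is illustrative only, not the kernel or host-library code, and the names are hypothetical.

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Host-side reference for an index-producing MAX reduction over one slice,
    // where 'slice' stands for the gathered elements of a single reduced fiber.
    template <typename T>
    void reduce_max_with_index(const std::vector<T>& slice, T& out_val, int32_t& out_idx)
    {
        assert(!slice.empty());
        out_val = slice[0];
        out_idx = 0;
        for(int32_t i = 1; i < static_cast<int32_t>(slice.size()); ++i)
            if(slice[i] > out_val) // AMAX would compare absolute values instead
            {
                out_val = slice[i];
                out_idx = i;
            }
    }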
-ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 1, 2, 1); - -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2 -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 7, 0, 0, 2, 1); - -// Will be moved to use MultiBlockAtomicAdd -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.cpp deleted file mode 100644 index 0e37c2947f..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.cpp +++ /dev/null @@ -1,24 +0,0 @@ -#include "device_reduce_instance_multiblock_partial_reduce.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 
1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.cpp deleted file mode 100644 index 4634faed06..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.cpp +++ /dev/null @@ -1,40 +0,0 @@ -#include "device_reduce_instance_multiblock_partial_reduce.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/profiler/include/profile_reduce_impl.hpp b/profiler/include/profile_reduce_impl.hpp index 33c7929ddd..a87694754e 100644 --- a/profiler/include/profile_reduce_impl.hpp +++ b/profiler/include/profile_reduce_impl.hpp @@ -5,74 +5,77 @@ #include "device_reduce_instance.hpp" #include "reduction_enums.hpp" #include "host_reduction.hpp" +#include "host_common_util.hpp" +#include "host_tensor_generator.hpp" namespace ck { namespace tensor_operation { namespace device { namespace 
device_reduce_instance {

-template <int Rank, int NumReduceDim, int ReduceOpId, int NanOpt, int IndicesOpt>
+template <int Rank, int NumReduceDim, int ReduceOpId, bool PropagateNan, bool UseIndex>
 struct ReduceDescription
 {
     static constexpr int Rank_         = Rank;
     static constexpr int NumReduceDim_ = NumReduceDim;
     static constexpr int ReduceOpId_   = ReduceOpId;
-    static constexpr int NanOpt_       = NanOpt;
-    static constexpr int IndicesOpt_   = IndicesOpt;
+    static constexpr int PropagateNan_ = PropagateNan;
+    static constexpr int UseIndex_     = UseIndex;
 };

-using reduce_description_instances = std::tuple<ReduceDescription<4, 3, 0, 0, 0>, // for ADD
-    ReduceDescription<4, 4, 0, 0, 0>,
-    ReduceDescription<4, 1, 0, 0, 0>,
-    ReduceDescription<2, 1, 0, 0, 0>,
+using reduce_description_instances =
+    std::tuple<ReduceDescription<4, 3, 0, false, false>, // for ADD
+    ReduceDescription<4, 4, 0, false, false>,
+    ReduceDescription<4, 1, 0, false, false>,
+    ReduceDescription<2, 1, 0, false, false>,

-    ReduceDescription<4, 3, 5, 0, 0>, // for AVG
-    ReduceDescription<4, 4, 5, 0, 0>,
-    ReduceDescription<4, 1, 5, 0, 0>,
-    ReduceDescription<2, 1, 5, 0, 0>,
+    ReduceDescription<4, 3, 5, false, false>, // for AVG
+    ReduceDescription<4, 4, 5, false, false>,
+    ReduceDescription<4, 1, 5, false, false>,
+    ReduceDescription<2, 1, 5, false, false>,

-    ReduceDescription<4, 3, 7, 0, 0>, // for NORM2
-    ReduceDescription<4, 4, 7, 0, 0>,
-    ReduceDescription<4, 1, 7, 0, 0>,
-    ReduceDescription<2, 1, 7, 0, 0>,
+    ReduceDescription<4, 3, 7, false, false>, // for NORM2
+    ReduceDescription<4, 4, 7, false, false>,
+    ReduceDescription<4, 1, 7, false, false>,
+    ReduceDescription<2, 1, 7, false, false>,

-    ReduceDescription<4, 3, 2, 0, 0>, // for MIN
-    ReduceDescription<4, 4, 2, 0, 0>,
-    ReduceDescription<4, 1, 2, 0, 0>,
-    ReduceDescription<2, 1, 2, 0, 0>,
-    ReduceDescription<4, 3, 3, 0, 0>, // for MAX
-    ReduceDescription<4, 4, 3, 0, 0>,
-    ReduceDescription<4, 1, 3, 0, 0>,
-    ReduceDescription<2, 1, 3, 0, 0>,
-    ReduceDescription<4, 3, 4, 0, 0>, // for AMAX
-    ReduceDescription<4, 4, 4, 0, 0>,
-    ReduceDescription<4, 1, 4, 0, 0>,
-    ReduceDescription<2, 1, 4, 0, 0>,
+    ReduceDescription<4, 3, 2, false, false>, // for MIN
+    ReduceDescription<4, 4, 2, false, false>,
+    ReduceDescription<4, 1, 2, false, false>,
+    ReduceDescription<2, 1, 2, false, false>,
+    ReduceDescription<4, 3, 3, false, false>, // for MAX
+    ReduceDescription<4, 4, 3, false, false>,
+    ReduceDescription<4, 1, 3, false, false>,
+    ReduceDescription<2, 1, 3, false, false>,
+    ReduceDescription<4, 3, 4, false, false>, // for AMAX
+    ReduceDescription<4, 4, 4, false, false>,
+    ReduceDescription<4, 1, 4, false, false>,
+    ReduceDescription<2, 1, 4, false, false>,

-    ReduceDescription<4, 3, 2, 0, 1>, // for MIN
-    ReduceDescription<4, 4, 2, 0, 1>,
-    ReduceDescription<4, 1, 2, 0, 1>,
-    ReduceDescription<2, 1, 2, 0, 1>,
-    ReduceDescription<4, 3, 3, 0, 1>, // for MAX
-    ReduceDescription<4, 4, 3, 0, 1>,
-    ReduceDescription<4, 1, 3, 0, 1>,
-    ReduceDescription<2, 1, 3, 0, 1>,
-    ReduceDescription<4, 3, 4, 0, 1>, // for AMAX
-    ReduceDescription<4, 4, 4, 0, 1>,
-    ReduceDescription<4, 1, 4, 0, 1>,
-    ReduceDescription<2, 1, 4, 0, 1>>;
+    ReduceDescription<4, 3, 2, false, true>, // for MIN
+    ReduceDescription<4, 4, 2, false, true>,
+    ReduceDescription<4, 1, 2, false, true>,
+    ReduceDescription<2, 1, 2, false, true>,
+    ReduceDescription<4, 3, 3, false, true>, // for MAX
+    ReduceDescription<4, 4, 3, false, true>,
+    ReduceDescription<4, 1, 3, false, true>,
+    ReduceDescription<2, 1, 3, false, true>,
+    ReduceDescription<4, 3, 4, false, true>, // for AMAX
+    ReduceDescription<4, 4, 4, false, true>,
+    ReduceDescription<4, 1, 4, false, true>,
+    ReduceDescription<2, 1, 4, false, true>>;

 template <typename DescriptionType>
 bool description_match(const DescriptionType&
description, int Rank, const std::vector& reduceDims, ReduceTensorOp ReduceOpId, - NanPropagation NanOpt, - ReduceTensorIndices IndicesOpt) + bool PropagateNan, + bool UseIndex) { if(description.Rank_ != Rank || description.ReduceOpId_ != static_cast(ReduceOpId) || - description.NanOpt_ != static_cast(NanOpt) || - description.IndicesOpt_ != static_cast(IndicesOpt)) + description.PropagateNan_ != static_cast(PropagateNan) || + description.UseIndex_ != static_cast(UseIndex)) return (false); if(DescriptionType::NumReduceDim_ != reduceDims.size()) @@ -116,46 +119,16 @@ static inline std::vector get_invariant_dims(const std::vector& reduce return invariantDims; }; -template -static void dumpBufferToFile(const char* fileName, T* data, size_t dataNumItems) -{ - std::ofstream outFile(fileName, std::ios::binary); - if(outFile) - { - outFile.write(reinterpret_cast(data), dataNumItems * sizeof(T)); - outFile.close(); - std::cout << "Write output to file " << fileName << std::endl; - } - else - { - std::cout << "Could not open file " << fileName << " for writing" << std::endl; - } -}; - -// map the data type used by the GPU kernels to the corresponding type used by the host codes -template -struct type_mapping -{ - using OutType = InType; -}; - -template <> -struct type_mapping -{ - using OutType = half_float::half; -}; - template -void profile_reduce_impl_impl(bool do_verification, + bool PropagateNan, + bool UseIndex> +bool profile_reduce_impl_impl(bool do_verification, int init_method, - bool do_log, bool do_dumpout, bool time_kernel, const std::vector& inLengths, @@ -166,15 +139,13 @@ void profile_reduce_impl_impl(bool do_verification, using namespace ck::tensor_operation::device; using namespace ck::tensor_operation::device::device_reduce_instance; using namespace ck::host_reduce; + using ck::host_common::dumpBufferToFile; constexpr bool op_support_indices = (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX); - constexpr bool NeedIndices = - (op_support_indices && (IndicesOpt != ReduceTensorIndices::NO_INDICES)); - - constexpr bool PropagateNan = (NanOpt == NanPropagation::PROPAGATE_NAN); + constexpr bool OutputIndex = (op_support_indices && UseIndex); constexpr bool out_support_atomic_add = std::is_same::value; constexpr bool op_support_atomic_add = @@ -195,8 +166,7 @@ void profile_reduce_impl_impl(bool do_verification, (op_support_indices && !std::is_same::value); // 1) The indices can only be used when the reduction operation is indexable - constexpr bool invalid_reduce_3 = - (!op_support_indices && IndicesOpt != ReduceTensorIndices::NO_INDICES); + constexpr bool invalid_reduce_3 = (!op_support_indices && UseIndex); // 1) If InDataType is int8_t, must use int8_t as AccDataType for indexable reduction operations // 2) If InDataType is int8_t, must use int32_t as AccDataType for non-indexable reduction @@ -219,6 +189,8 @@ void profile_reduce_impl_impl(bool do_verification, constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3 || invalid_reduce_4 || invalid_reduce_5 || invalid_reduce_6); + bool pass = true; + if constexpr(!invalid_reduce) { Tensor in(inLengths); @@ -282,7 +254,7 @@ void profile_reduce_impl_impl(bool do_verification, if(beta != 0.0f) out_dev.ToDevice(out.mData.data()); - size_t indicesSizeInBytes = NeedIndices ? out.mDesc.GetElementSize() * sizeof(int) : 0; + size_t indicesSizeInBytes = OutputIndex ? 
out.mDesc.GetElementSize() * sizeof(int) : 0; DeviceMem out_indices_dev(indicesSizeInBytes); @@ -295,29 +267,11 @@ void profile_reduce_impl_impl(bool do_verification, using AccElementwiseOperation_0 = typename reduce_unary_operator:: AccElementwiseOperation; - using InElementwiseOperation_1 = - typename reduce_unary_operator:: - InElementwiseOperation; - using AccElementwiseOperation_1 = - typename reduce_unary_operator:: - AccElementwiseOperation; - using InElementwiseOperation_2 = - typename reduce_unary_operator:: - InElementwiseOperation; - using AccElementwiseOperation_2 = - typename reduce_unary_operator:: - AccElementwiseOperation; using DeviceReduceInstPtr0 = DeviceReducePtr; - using DeviceReduceInstPtr1 = - DeviceReducePtr; - using DeviceReduceInstPtr2 = - DeviceReducePtr; std::vector reduce0_ptrs; - std::vector reduce1_ptrs; - std::vector reduce2_ptrs; add_device_reduce_instance_threadwise(reduce0_ptrs); + PropagateNan, + UseIndex>(reduce0_ptrs); add_device_reduce_instance_blockwise(reduce0_ptrs); + PropagateNan, + UseIndex>(reduce0_ptrs); if constexpr(use_atomic_add) { @@ -345,35 +299,11 @@ void profile_reduce_impl_impl(bool do_verification, Rank, NumReduceDim, ReduceOpId, - NanOpt, - IndicesOpt>(reduce0_ptrs); + PropagateNan, + UseIndex>(reduce0_ptrs); } - else - { - add_device_reduce_instance_multiblock_partial_reduce(reduce1_ptrs); - }; - // used for secondary reduction - if constexpr(!use_atomic_add) - { - add_device_reduce_instance_blockwise_second_call(reduce2_ptrs); - }; - - if(reduce0_ptrs.empty() && reduce1_ptrs.empty()) + if(reduce0_ptrs.empty()) { throw std::runtime_error("Wrong! No device REDUCE instance found"); }; @@ -387,23 +317,25 @@ void profile_reduce_impl_impl(bool do_verification, Rank, NumReduceDim, PropagateNan, - NeedIndices> + OutputIndex> hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims); hostReduce.Run( alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data()); }; - const auto i_inLengths = to_int_vector(inLengths); - const auto i_inStrides = to_int_vector(inStrides); - const auto i_outLengths = to_int_vector(outLengths); - const auto i_outStrides = to_int_vector(outStrides); + std::vector i_inLengths; + std::vector i_inStrides; + std::vector i_outLengths; + std::vector i_outStrides; + + i_inLengths.assign(inLengths.begin(), inLengths.end()); + i_inStrides.assign(inStrides.begin(), inStrides.end()); + i_outLengths.assign(outLengths.begin(), outLengths.end()); + i_outStrides.assign(outStrides.begin(), outStrides.end()); for(auto& reduce_ptr : reduce0_ptrs) { - auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims); - - DeviceMem ws_dev(wsSizeInBytes); InElementwiseOperation_0 in_elementwise_op_0(static_cast(reduce_total_length)); AccElementwiseOperation_0 acc_elementwise_op_0( @@ -417,9 +349,9 @@ void profile_reduce_impl_impl(bool do_verification, alpha, beta, in_dev.GetDeviceBuffer(), + nullptr, out_dev.GetDeviceBuffer(), out_indices_dev.GetDeviceBuffer(), - ws_dev.GetDeviceBuffer(), in_elementwise_op_0, acc_elementwise_op_0); @@ -439,8 +371,9 @@ void profile_reduce_impl_impl(bool do_verification, float gb_per_sec = num_bytes / 1.E6 / avg_time; - std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " << reduce_name - << std::endl; + if(time_kernel) + std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " + << reduce_name << std::endl; if(gb_per_sec > best_gb_per_sec) { @@ -450,22 +383,24 @@ void profile_reduce_impl_impl(bool do_verification, 
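To summarize the control flow after this change: the profiler keeps a single instance list (reduce0_ptrs) and runs the whole reduction in one pass, passing nullptr in the slot where the separately allocated workspace buffer used to go. The shape of the loop, condensed from the hunks above (argument order follows the patch; perf bookkeeping, verification, and dump-out are elided):

    // One-pass invocation shape after this patch (condensed, not verbatim).
    for(auto& reduce_ptr : reduce0_ptrs)
    {
        auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths,
                                                            i_inStrides,
                                                            i_outLengths,
                                                            i_outStrides,
                                                            reduceDims,
                                                            alpha,
                                                            beta,
                                                            in_dev.GetDeviceBuffer(),
                                                            nullptr, // workspace no longer allocated
                                                            out_dev.GetDeviceBuffer(),
                                                            out_indices_dev.GetDeviceBuffer(),
                                                            in_elementwise_op_0,
                                                            acc_elementwise_op_0);

        if(!reduce_ptr->IsSupportedArgument(argument_ptr.get()))
            continue; // skip instances that cannot handle this problem shape

        auto invoker_ptr = reduce_ptr->MakeInvokerPointer();
        float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
        // ... perf bookkeeping, verification, and dump-out as in the hunk above ...
    }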
if(do_verification) { - out_dev.FromDevice(out.mData.data()); - ck::utils::check_err(out.mData, out_ref.mData); + bool single_pass; - if(NeedIndices) + out_dev.FromDevice(out.mData.data()); + single_pass = ck::utils::check_err(out.mData, out_ref.mData); + + if(OutputIndex) { out_indices_dev.FromDevice(out_indices.mData.data()); - ck::utils::check_err(out_indices.mData, out_indices_ref.mData); - ; + single_pass = single_pass && + ck::utils::check_err(out_indices.mData, out_indices_ref.mData); }; - if(do_log) + if(!single_pass) { - LogRangeAsType(std::cout << "out_host : ", out_ref.mData, ",") - << std::endl; - LogRangeAsType(std::cout << "out_device: ", out.mData, ",") << std::endl; - }; + std::cout << "Fail Info: " << reduce_ptr->GetTypeString() << std::endl; + } + + pass = pass && single_pass; }; if(do_dumpout) @@ -474,7 +409,7 @@ void profile_reduce_impl_impl(bool do_verification, dumpBufferToFile("dump_out.bin", out.mData.data(), out.mDesc.GetElementSize()); dumpBufferToFile( "dump_out_host.bin", out_ref.mData.data(), out_ref.mDesc.GetElementSize()); - if(NeedIndices) + if(OutputIndex) { dumpBufferToFile("dump_indices.bin", out_indices.mData.data(), @@ -486,158 +421,34 @@ void profile_reduce_impl_impl(bool do_verification, }; }; - for(auto& reduce_ptr : reduce1_ptrs) - { - auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims); - - DeviceMem ws_dev(wsSizeInBytes); - - InElementwiseOperation_1 in_elementwise_op_1(static_cast(reduce_total_length)); - AccElementwiseOperation_1 acc_elementwise_op_1( - static_cast(reduce_total_length)); - - auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths, - i_inStrides, - i_outLengths, - i_outStrides, - reduceDims, - alpha, - beta, - in_dev.GetDeviceBuffer(), - out_dev.GetDeviceBuffer(), - out_indices_dev.GetDeviceBuffer(), - ws_dev.GetDeviceBuffer(), - in_elementwise_op_1, - acc_elementwise_op_1); - - if(!reduce_ptr->IsSupportedArgument(argument_ptr.get())) - continue; - - std::string reduce_name = reduce_ptr->GetTypeString(); - - auto invoker_ptr = reduce_ptr->MakeInvokerPointer(); - - float avg_time = - invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); - - std::size_t num_bytes = - invariant_total_length * reduce_total_length * sizeof(InDataType) + - invariant_total_length * sizeof(OutDataType); - - std::vector inLengths2 = reduce_ptr->GetWorkspace2dLengths(argument_ptr.get()); - std::vector inStrides2{inLengths2[1], 1}; - - for(auto& reduce2_ptr : reduce2_ptrs) - { - InElementwiseOperation_2 in_elementwise_op_2( - static_cast(reduce_total_length)); - AccElementwiseOperation_2 acc_elementwise_op_2( - static_cast(reduce_total_length)); - - auto argument2_ptr = - reduce2_ptr->MakeArgumentPointer(inLengths2, - inStrides2, - i_outLengths, - i_outStrides, - reduceDims, - alpha, - beta, - ws_dev.GetDeviceBuffer(), - out_dev.GetDeviceBuffer(), - out_indices_dev.GetDeviceBuffer(), - ws_dev.GetDeviceBuffer(), - in_elementwise_op_2, - acc_elementwise_op_2); - - if(!reduce2_ptr->IsSupportedArgument(argument2_ptr.get())) - continue; - - std::string reduce2_name = reduce2_ptr->GetTypeString(); - - auto invoker2_ptr = reduce2_ptr->MakeInvokerPointer(); - - float avg_time_2 = - invoker2_ptr->Run(argument2_ptr.get(), StreamConfig{nullptr, time_kernel}); - - std::size_t num_bytes_2 = - static_cast(inLengths2[0]) * inLengths2[1] * sizeof(AccDataType); - - float gb_per_sec = (num_bytes + num_bytes_2) / 1.E6 / (avg_time + avg_time_2); - - std::cout << "Perf: " << (avg_time + avg_time_2) << " ms, " << 
gb_per_sec - << " GB/s, " << reduce_name << " => " << reduce2_name << std::endl; - - if(gb_per_sec > best_gb_per_sec) - { - best_avg_time = avg_time + avg_time_2; - best_gb_per_sec = gb_per_sec; - } - - if(do_verification) - { - out_dev.FromDevice(out.mData.data()); - ck::utils::check_err(out.mData, out_ref.mData); - - if(NeedIndices) - { - out_indices_dev.FromDevice(out_indices.mData.data()); - ck::utils::check_err(out_indices.mData, out_indices_ref.mData); - ; - }; - - if(do_log) - { - LogRangeAsType(std::cout << "out_host : ", out_ref.mData, ",") - << std::endl; - LogRangeAsType(std::cout << "out_device: ", out.mData, ",") - << std::endl; - } - } - - if(do_dumpout) - { - dumpBufferToFile("dump_in.bin", in.mData.data(), in.mDesc.GetElementSize()); - dumpBufferToFile("dump_out.bin", out.mData.data(), out.mDesc.GetElementSize()); - dumpBufferToFile( - "dump_out_host.bin", out_ref.mData.data(), out_ref.mDesc.GetElementSize()); - if(NeedIndices) - { - dumpBufferToFile("dump_indices.bin", - out_indices.mData.data(), - out_indices.mDesc.GetElementSize()); - dumpBufferToFile("dump_indices_host.bin", - out_indices_ref.mData.data(), - out_indices_ref.mDesc.GetElementSize()); - }; - }; - }; - }; - - std::cout << "Best Perf: " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s" - << std::endl; + if(time_kernel) + std::cout << "Best Perf: " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s" + << std::endl; } else { std::cout << "The requested reduction operation is not supported, please check !!!" << std::endl; }; + + return pass; }; template -void profile_reduce_impl(bool do_verification, +bool profile_reduce_impl(bool do_verification, int init_method, - bool do_log, bool do_dumpout, bool time_kernel, const std::vector& inLengths, const std::vector& reduceDims, ReduceTensorOp ReduceOpId, - NanPropagation NanOpt, - ReduceTensorIndices IndicesOpt, + bool PropagateNan, + bool UseIndex, float alpha, float beta) { bool matched = false; + bool pass = true; using tuple_of_description_instances = tensor_operation::device::device_reduce_instance::reduce_description_instances; @@ -651,29 +462,30 @@ void profile_reduce_impl(bool do_verification, using descType = remove_cvref_t(tuple_object))>; if(!description_match( - descType{}, inLengths.size(), reduceDims, ReduceOpId, NanOpt, IndicesOpt)) + descType{}, inLengths.size(), reduceDims, ReduceOpId, PropagateNan, UseIndex)) return; - profile_reduce_impl_impl(descType::ReduceOpId_), - static_cast(descType::NanOpt_), - static_cast(descType::IndicesOpt_)>( - do_verification, - init_method, - do_log, - do_dumpout, - time_kernel, - inLengths, - reduceDims, - alpha, - beta); + pass = pass && + profile_reduce_impl_impl(descType::ReduceOpId_), + static_cast(descType::PropagateNan_), + static_cast(descType::UseIndex_)>(do_verification, + init_method, + do_dumpout, + time_kernel, + inLengths, + reduceDims, + alpha, + beta); matched = true; }); + + return pass; }; } // namespace profiler diff --git a/profiler/src/profile_reduce.cpp b/profiler/src/profile_reduce.cpp index 5e91a1d2d1..bdbac4fab4 100644 --- a/profiler/src/profile_reduce.cpp +++ b/profiler/src/profile_reduce.cpp @@ -1,27 +1,19 @@ #include #include -#include -#include #include #include #include #include #include -#include "config.hpp" -#include "print.hpp" -#include "device.hpp" -#include "host_tensor.hpp" -#include "host_tensor_generator.hpp" -#include "device_tensor.hpp" +#include "data_type_enum.hpp" #include "reduction_enums.hpp" +#include "host_common_util.hpp" #include 
"profile_reduce_impl.hpp" using namespace std; -using ck::NanPropagation; -using ck::ReduceTensorIndices; using ck::ReduceTensorOp; static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'}, @@ -38,63 +30,9 @@ static struct option long_options[] = {{"inLengths", required_argument, nullptr, {"bf16", no_argument, nullptr, '?'}, {"dumpout", required_argument, nullptr, 'o'}, {"verify", required_argument, nullptr, 'v'}, - {"log", required_argument, nullptr, 'l'}, {"help", no_argument, nullptr, '?'}, {nullptr, 0, nullptr, 0}}; -template -static T getSingleValueFromString(const string& valueStr) -{ - std::istringstream iss(valueStr); - - T val; - - iss >> val; - - return (val); -}; - -template -static std::vector getTypeValuesFromString(const char* cstr_values) -{ - std::string valuesStr(cstr_values); - - std::vector values; - std::size_t pos = 0; - std::size_t new_pos; - - new_pos = valuesStr.find(',', pos); - while(new_pos != std::string::npos) - { - const std::string sliceStr = valuesStr.substr(pos, new_pos - pos); - - T val = getSingleValueFromString(sliceStr); - - values.push_back(val); - - pos = new_pos + 1; - new_pos = valuesStr.find(',', pos); - }; - - std::string sliceStr = valuesStr.substr(pos); - T val = getSingleValueFromString(sliceStr); - - values.push_back(val); - - return (values); -} - -enum struct AppDataType -{ - appHalf = 0, - appFloat = 1, - appInt32 = 2, - appInt8 = 3, - appInt8x4 = 4, - appBFloat16 = 5, - appDouble = 6, -}; - static void check_reduce_dims(const int rank, const std::vector& reduceDims) { for(auto dim : reduceDims) @@ -113,7 +51,7 @@ static void check_reduce_dims(const int rank, const std::vector& reduceDims }; }; -class AppArgs +class ReduceProfilerArgs { private: int option_index = 0; @@ -130,26 +68,23 @@ class AppArgs std::vector scales; - ReduceTensorOp reduceOp = ReduceTensorOp::ADD; - AppDataType compTypeId = AppDataType::appFloat; - AppDataType outTypeId = AppDataType::appFloat; + ReduceTensorOp reduceOp = ReduceTensorOp::ADD; + ck::DataTypeEnum compTypeId = ck::DataTypeEnum::Float; + ck::DataTypeEnum outTypeId = ck::DataTypeEnum::Float; bool compType_assigned = false; bool outType_assigned = false; - NanPropagation nanOpt = NanPropagation::NOT_PROPAGATE_NAN; - ReduceTensorIndices indicesOpt = ReduceTensorIndices::NO_INDICES; - bool do_log = false; - bool do_verification = false; - bool do_dumpout = false; + int nanOpt = 0; + int indicesOpt = 0; + bool do_verification = false; + bool do_dumpout = false; int init_method; bool time_kernel; - bool need_indices = false; - - AppArgs() = default; - ~AppArgs() = default; + ReduceProfilerArgs() = default; + ~ReduceProfilerArgs() = default; void show_usage(const char* cmd) { @@ -166,8 +101,11 @@ class AppArgs std::cout << "--outType or -W, optional enum value indicating the type of the reduced " "output, which could be float when the input data is half" << std::endl; - std::cout << "--nanOpt or -N, enum value indicates the selection for NanOpt" << std::endl; - std::cout << "--indicesOpt or -I, enum value indicates the selection for IndicesOpt" + std::cout + << "--nanOpt or -N, 1/0 value indicates the selection to use or not use Nan-Propagation" + << std::endl; + std::cout << "--indicesOpt or -I, 1/0 value indicates the selection to use or not use " + "index in reduction" << std::endl; std::cout << "--scales or -S, comma separated two float values for alpha and beta" << std::endl; @@ -181,18 +119,19 @@ class AppArgs std::cout << "--dumpout or -o, 1/0 to indicate where to save the 
reduction result to files " "for further analysis" << std::endl;
-        std::cout << "--log or -l, 1/0 to indicate whether to log some information" << std::endl;
     };

     int processArgs(int argc, char* argv[])
     {
+        using ck::host_common::getTypeValuesFromString;
+
         int ch;

         optind++; // to skip the "reduce" module name

         while(1)
         {
-            ch = getopt_long(argc, argv, "D:R:O:C:W:N:I:S:v:o:l:", long_options, &option_index);
+            ch = getopt_long(argc, argv, "D:R:O:C:W:N:I:S:v:o:", long_options, &option_index);
             if(ch == -1)
                 break;

             switch(ch)
@@ -219,27 +158,27 @@
                 if(!optarg)
                     throw std::runtime_error("Invalid option format!");

-                compTypeId        = static_cast<AppDataType>(std::atoi(optarg));
+                compTypeId        = static_cast<ck::DataTypeEnum>(std::atoi(optarg));
                 compType_assigned = true;
                 break;
             case 'W':
                 if(!optarg)
                     throw std::runtime_error("Invalid option format!");

-                outTypeId        = static_cast<AppDataType>(std::atoi(optarg));
+                outTypeId        = static_cast<ck::DataTypeEnum>(std::atoi(optarg));
                 outType_assigned = true;
                 break;
             case 'N':
                 if(!optarg)
                     throw std::runtime_error("Invalid option format!");

-                nanOpt = static_cast<NanPropagation>(std::atoi(optarg));
+                nanOpt = std::atoi(optarg);
                 break;
             case 'I':
                 if(!optarg)
                     throw std::runtime_error("Invalid option format!");

-                indicesOpt = static_cast<ReduceTensorIndices>(std::atoi(optarg));
+                indicesOpt = std::atoi(optarg);
                 break;
             case 'S':
                 if(!optarg)
@@ -262,12 +201,6 @@
                 do_dumpout = static_cast<bool>(std::atoi(optarg));
                 break;

-            case 'l':
-                if(!optarg)
-                    throw std::runtime_error("Invalid option format!");
-
-                do_log = static_cast<bool>(std::atoi(optarg));
-                break;
             case '?':
                 if(std::string(long_options[option_index].name) == "half")
                     use_half = true;
@@ -295,7 +228,7 @@
             throw std::runtime_error("Invalid cmd-line arguments, more arguments are needed!");

         init_method = std::atoi(argv[optind++]);
-        time_kernel = std::atoi(argv[optind]);
+        time_kernel = static_cast<bool>(std::atoi(argv[optind]));

         if(scales.empty())
         {
@@ -306,9 +239,6 @@
         if(reduceOp == ReduceTensorOp::MIN || reduceOp == ReduceTensorOp::MAX ||
            reduceOp == ReduceTensorOp::AMAX)
         {
-            if(indicesOpt != ReduceTensorIndices::NO_INDICES)
-                need_indices = true;
-
             // for indexable operations, no need to assign compType and outType, just let them be
             // same as inType
             compType_assigned = false;
@@ -322,9 +252,10 @@

 int profile_reduce(int argc, char* argv[])
 {
-    using namespace ck::profiler;
+    using ck::DataTypeEnum;
+    using ck::profiler::profile_reduce_impl;

-    AppArgs args;
+    ReduceProfilerArgs args;

     if(args.processArgs(argc, argv) < 0)
         return (-1);
@@ -339,42 +270,41 @@
     if(args.use_half)
     {
         if(!args.compType_assigned)
-            args.compTypeId = AppDataType::appHalf;
+            args.compTypeId = DataTypeEnum::Half;

         if(args.outType_assigned &&
-           (args.outTypeId != AppDataType::appHalf && args.outTypeId != AppDataType::appFloat))
-            args.outTypeId = AppDataType::appFloat;
+           (args.outTypeId != DataTypeEnum::Half && args.outTypeId != DataTypeEnum::Float))
+            args.outTypeId = DataTypeEnum::Float;

         if(!args.outType_assigned)
-            args.outTypeId = AppDataType::appHalf;
+            args.outTypeId = DataTypeEnum::Half;

-        if(args.compTypeId == AppDataType::appHalf)
+        if(args.compTypeId == DataTypeEnum::Half)
         {
-            profile_reduce_impl<ck::half_t, ck::half_t, ck::half_t>(args.do_verification,
-                                                                    args.init_method,
-                                                                    args.do_log,
-                                                                    args.do_dumpout,
-                                                                    args.time_kernel,
-                                                                    args.inLengths,
-                                                                    args.reduceDims,
-                                                                    args.reduceOp,
-                                                                    args.nanOpt,
-                                                                    args.indicesOpt,
-                                                                    args.scales[0],
-                                                                    args.scales[1]);
+            profile_reduce_impl<ck::half_t, ck::half_t, ck::half_t>(
+                args.do_verification,
+                args.init_method,
+                args.do_dumpout,
+                args.time_kernel,
+                args.inLengths,
+                args.reduceDims,
+
args.reduceOp, + static_cast(args.nanOpt), + static_cast(args.indicesOpt), + args.scales[0], + args.scales[1]); } - else if(args.compTypeId == AppDataType::appFloat) + else if(args.compTypeId == DataTypeEnum::Float) { profile_reduce_impl(args.do_verification, args.init_method, - args.do_log, args.do_dumpout, args.time_kernel, args.inLengths, args.reduceDims, args.reduceOp, - args.nanOpt, - args.indicesOpt, + static_cast(args.nanOpt), + static_cast(args.indicesOpt), args.scales[0], args.scales[1]); } @@ -385,56 +315,53 @@ int profile_reduce(int argc, char* argv[]) { profile_reduce_impl(args.do_verification, args.init_method, - args.do_log, args.do_dumpout, args.time_kernel, args.inLengths, args.reduceDims, args.reduceOp, - args.nanOpt, - args.indicesOpt, + static_cast(args.nanOpt), + static_cast(args.indicesOpt), args.scales[0], args.scales[1]); } else if(args.use_int8) { if(!args.compType_assigned) - args.compTypeId = AppDataType::appInt8; + args.compTypeId = DataTypeEnum::Int8; if(args.outType_assigned && - (args.outTypeId != AppDataType::appInt8 && args.outTypeId != AppDataType::appInt32)) - args.outTypeId = AppDataType::appInt32; + (args.outTypeId != DataTypeEnum::Int8 && args.outTypeId != DataTypeEnum::Int32)) + args.outTypeId = DataTypeEnum::Int32; if(!args.outType_assigned) - args.outTypeId = AppDataType::appInt8; + args.outTypeId = DataTypeEnum::Int8; - if(args.compTypeId == AppDataType::appInt8) + if(args.compTypeId == DataTypeEnum::Int8) { profile_reduce_impl(args.do_verification, args.init_method, - args.do_log, args.do_dumpout, args.time_kernel, args.inLengths, args.reduceDims, args.reduceOp, - args.nanOpt, - args.indicesOpt, + static_cast(args.nanOpt), + static_cast(args.indicesOpt), args.scales[0], args.scales[1]); } - else if(args.compTypeId == AppDataType::appInt32) + else if(args.compTypeId == DataTypeEnum::Int32) { profile_reduce_impl(args.do_verification, args.init_method, - args.do_log, args.do_dumpout, args.time_kernel, args.inLengths, args.reduceDims, args.reduceOp, - args.nanOpt, - args.indicesOpt, + static_cast(args.nanOpt), + static_cast(args.indicesOpt), args.scales[0], args.scales[1]); } @@ -444,54 +371,51 @@ int profile_reduce(int argc, char* argv[]) else if(args.use_bf16) { if(args.outType_assigned && - (args.outTypeId != AppDataType::appBFloat16 && args.outTypeId != AppDataType::appFloat)) - args.outTypeId = AppDataType::appFloat; + (args.outTypeId != DataTypeEnum::BFloat16 && args.outTypeId != DataTypeEnum::Float)) + args.outTypeId = DataTypeEnum::Float; if(!args.outType_assigned) - args.outTypeId = AppDataType::appBFloat16; + args.outTypeId = DataTypeEnum::BFloat16; profile_reduce_impl(args.do_verification, args.init_method, - args.do_log, args.do_dumpout, args.time_kernel, args.inLengths, args.reduceDims, args.reduceOp, - args.nanOpt, - args.indicesOpt, + static_cast(args.nanOpt), + static_cast(args.indicesOpt), args.scales[0], args.scales[1]); } else { - if(args.compTypeId == AppDataType::appFloat) + if(args.compTypeId == DataTypeEnum::Float) { profile_reduce_impl(args.do_verification, args.init_method, - args.do_log, args.do_dumpout, args.time_kernel, args.inLengths, args.reduceDims, args.reduceOp, - args.nanOpt, - args.indicesOpt, + static_cast(args.nanOpt), + static_cast(args.indicesOpt), args.scales[0], args.scales[1]); } - else if(args.compTypeId == AppDataType::appDouble) + else if(args.compTypeId == DataTypeEnum::Double) { profile_reduce_impl(args.do_verification, args.init_method, - args.do_log, args.do_dumpout, args.time_kernel, args.inLengths, 
args.reduceDims, args.reduceOp, - args.nanOpt, - args.indicesOpt, + static_cast(args.nanOpt), + static_cast(args.indicesOpt), args.scales[0], args.scales[1]); } diff --git a/script/test_reduce_no_index.sh b/script/test_reduce_no_index.sh index 95e563c93c..b956303837 100755 --- a/script/test_reduce_no_index.sh +++ b/script/test_reduce_no_index.sh @@ -15,6 +15,17 @@ bin/test_reduce_no_index -D 64,4,280,82 -R 1 0 2 bin/test_reduce_no_index -D 64,4,280,82 -R 2 0 2 bin/test_reduce_no_index -D 64,4,280,82 -R 3 0 2 +## for float64 +bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 6 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 6 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 6 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 6 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 6 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 0 6 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 1 6 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 2 6 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 3 6 2 + ## for float16 bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 1 2 bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 1 2 diff --git a/script/test_reduce_with_index.sh b/script/test_reduce_with_index.sh index 8e7ed33847..b0843ba6c1 100755 --- a/script/test_reduce_with_index.sh +++ b/script/test_reduce_with_index.sh @@ -15,6 +15,17 @@ bin/test_reduce_with_index -D 64,4,280,82 -R 1 0 2 bin/test_reduce_with_index -D 64,4,280,82 -R 2 0 2 bin/test_reduce_with_index -D 64,4,280,82 -R 3 0 2 +## for float64 +bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 6 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 6 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 6 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 6 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 6 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 0 6 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 1 6 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 2 6 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 3 6 2 + ## for float16 bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 1 2 bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 1 2 diff --git a/test/reduce/reduce_no_index.cpp b/test/reduce/reduce_no_index.cpp index 317abab53a..20030392b5 100644 --- a/test/reduce/reduce_no_index.cpp +++ b/test/reduce/reduce_no_index.cpp @@ -1,384 +1,10 @@ #include "getopt.h" -#include "check_err.hpp" -#include "device_reduce_instance.hpp" -#include "reduction_enums.hpp" -#include "host_tensor.hpp" -#include "host_tensor_generator.hpp" -#include "host_reduction.hpp" -#include "reduce_util.hpp" +#include "host_common_util.hpp" +#include "profile_reduce_impl.hpp" using namespace ck; -namespace { - -template -static inline std::vector get_invariant_dims(const std::vector& reduceDims) -{ - assert(NumReduceDim == reduceDims.size()); - - int reduceFlag = 0; - - // flag the bits for the reduceDims - for(int i = 0; i < NumReduceDim; i++) - { - reduceFlag |= 1 << reduceDims[i]; - }; - - std::vector invariantDims; - - // collect invariant dimensions - for(int i = 0; i < Rank; i++) - if((reduceFlag & (1 << i)) == 0) - { - invariantDims.push_back(i); - }; - - return invariantDims; -}; - -constexpr int Rank = 4; - -constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::AVG; -constexpr NanPropagation NanOpt = NanPropagation::PROPAGATE_NAN; -constexpr bool PropagateNan = false; -constexpr ReduceTensorIndices IndicesOpt = ReduceTensorIndices::NO_INDICES; -constexpr bool NeedIndices = false; - -template -bool test_reduce_no_index_impl(int init_method, - const 
std::vector& inLengths, - const std::vector& reduceDims, - float alpha, - float beta) -{ - using namespace ck::tensor_operation::device; - using namespace ck::tensor_operation::device::device_reduce_instance; - using namespace ck::host_reduce; - - constexpr bool out_support_atomic_add = std::is_same::value; - constexpr bool op_support_atomic_add = true; - constexpr bool use_atomic_add = (out_support_atomic_add && op_support_atomic_add); - - Tensor in(inLengths); - - std::vector outLengths; - - const auto invariantDims = get_invariant_dims(reduceDims); - - if(reduceDims.size() == Rank) - outLengths.push_back(1); - else - for(auto dim : invariantDims) - outLengths.push_back(inLengths[dim]); - - Tensor out_ref(outLengths); - Tensor out(outLengths); - - // only used when the OutDataType is bhalf_t - Tensor out_ref_fp32(outLengths); - Tensor out_fp32(outLengths); - - auto inStrides = in.mDesc.GetStrides(); - auto outStrides = out.mDesc.GetStrides(); - - size_t invariant_total_length = out.mDesc.GetElementSize(); - size_t reduce_total_length = in.mDesc.GetElementSize() / invariant_total_length; - - std::size_t num_thread = 1; - - switch(init_method) - { - case 0: break; - case 1: - in.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); - if(beta != 0.0f) - out_ref.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); - break; - case 2: - in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); - if(beta != 0.0f) - out_ref.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); - break; - default: - in.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, num_thread); - if(beta != 0.0f) - out_ref.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, num_thread); - } - - if(beta != 0.0f) - for(size_t i = 0; i < out_ref.mDesc.GetElementSpace(); i++) - out.mData[i] = out_ref.mData[i]; - - // these buffers are usually provided by the user application - DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpace()); - DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpace()); - - in_dev.ToDevice(in.mData.data()); - - if(beta != 0.0f) - out_dev.ToDevice(out.mData.data()); - - using InElementwiseOperation_0 = - typename reduce_unary_operator::InElementwiseOperation; - using AccElementwiseOperation_0 = - typename reduce_unary_operator:: - AccElementwiseOperation; - using InElementwiseOperation_1 = - typename reduce_unary_operator:: - InElementwiseOperation; - using AccElementwiseOperation_1 = - typename reduce_unary_operator:: - AccElementwiseOperation; - using InElementwiseOperation_2 = - typename reduce_unary_operator:: - InElementwiseOperation; - using AccElementwiseOperation_2 = - typename reduce_unary_operator:: - AccElementwiseOperation; - - using DeviceReduceInstPtr0 = - DeviceReducePtr; - using DeviceReduceInstPtr1 = - DeviceReducePtr; - using DeviceReduceInstPtr2 = - DeviceReducePtr; - - std::vector reduce0_ptrs; - std::vector reduce1_ptrs; - std::vector reduce2_ptrs; - - add_device_reduce_instance_threadwise(reduce0_ptrs); - - add_device_reduce_instance_blockwise(reduce0_ptrs); - - if constexpr(use_atomic_add) - { - add_device_reduce_instance_multiblock_atomic_add(reduce0_ptrs); - } - else - { - add_device_reduce_instance_multiblock_partial_reduce(reduce1_ptrs); - }; - - // used for secondary reduction - if constexpr(!use_atomic_add) - { - add_device_reduce_instance_blockwise_second_call(reduce2_ptrs); - }; - - if(reduce0_ptrs.empty() && reduce1_ptrs.empty()) - { - throw std::runtime_error("Wrong! 
No device REDUCE instance found"); - }; - - bool result = true; - - ReductionHost - hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims); - - hostReduce.Run(alpha, in.mData.data(), beta, out_ref.mData.data(), nullptr); - - const auto i_inLengths = to_int_vector(inLengths); - const auto i_inStrides = to_int_vector(inStrides); - const auto i_outLengths = to_int_vector(outLengths); - const auto i_outStrides = to_int_vector(outStrides); - - for(auto& reduce_ptr : reduce0_ptrs) - { - auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims); - - DeviceMem ws_dev(wsSizeInBytes); - - InElementwiseOperation_0 in_elementwise_op_0(static_cast(reduce_total_length)); - AccElementwiseOperation_0 acc_elementwise_op_0(static_cast(reduce_total_length)); - - auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths, - i_inStrides, - i_outLengths, - i_outStrides, - reduceDims, - alpha, - beta, - in_dev.GetDeviceBuffer(), - out_dev.GetDeviceBuffer(), - nullptr, - ws_dev.GetDeviceBuffer(), - in_elementwise_op_0, - acc_elementwise_op_0); - - if(!reduce_ptr->IsSupportedArgument(argument_ptr.get())) - continue; - - auto invoker_ptr = reduce_ptr->MakeInvokerPointer(); - - (void)invoker_ptr->Run(argument_ptr.get()); - - out_dev.FromDevice(out.mData.data()); - - bool single_result = true; - - if constexpr(std::is_same::value || - std::is_same::value) - { - reduce_util::to_f32_vector(out, out_fp32); - reduce_util::to_f32_vector(out_ref, out_ref_fp32); - single_result = ck::utils::check_err( - out_fp32.mData, out_ref_fp32.mData, "Error: incorrect data result!"); - } - else - { - single_result = - ck::utils::check_err(out.mData, out_ref.mData, "Error: incorrect data result!"); - }; - - if(!single_result) - { - std::cout << "Fail Info: " << reduce_ptr->GetTypeString() << std::endl; - result = false; - } - }; - - for(auto& reduce_ptr : reduce1_ptrs) - { - auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims); - - DeviceMem ws_dev(wsSizeInBytes); - - InElementwiseOperation_1 in_elementwise_op_1(static_cast(reduce_total_length)); - AccElementwiseOperation_1 acc_elementwise_op_1(static_cast(reduce_total_length)); - - auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths, - i_inStrides, - i_outLengths, - i_outStrides, - reduceDims, - alpha, - beta, - in_dev.GetDeviceBuffer(), - out_dev.GetDeviceBuffer(), - nullptr, - ws_dev.GetDeviceBuffer(), - in_elementwise_op_1, - acc_elementwise_op_1); - - if(!reduce_ptr->IsSupportedArgument(argument_ptr.get())) - continue; - - auto invoker_ptr = reduce_ptr->MakeInvokerPointer(); - - (void)invoker_ptr->Run(argument_ptr.get()); - - std::vector inLengths2 = reduce_ptr->GetWorkspace2dLengths(argument_ptr.get()); - std::vector inStrides2{inLengths2[1], 1}; - - for(auto& reduce2_ptr : reduce2_ptrs) - { - InElementwiseOperation_2 in_elementwise_op_2(static_cast(reduce_total_length)); - AccElementwiseOperation_2 acc_elementwise_op_2( - static_cast(reduce_total_length)); - - auto argument2_ptr = reduce2_ptr->MakeArgumentPointer(inLengths2, - inStrides2, - i_outLengths, - i_outStrides, - reduceDims, - alpha, - beta, - ws_dev.GetDeviceBuffer(), - out_dev.GetDeviceBuffer(), - nullptr, - ws_dev.GetDeviceBuffer(), - in_elementwise_op_2, - acc_elementwise_op_2); - - if(!reduce2_ptr->IsSupportedArgument(argument2_ptr.get())) - continue; - - std::string reduce2_name = reduce2_ptr->GetTypeString(); - - auto invoker2_ptr = reduce2_ptr->MakeInvokerPointer(); - - (void)invoker2_ptr->Run(argument2_ptr.get()); - - 
 static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
                                        {"reduceDimensions", required_argument, nullptr, 'R'},
                                        {"scales", required_argument, nullptr, 'S'},
@@ -387,48 +13,6 @@ static struct option long_options[] = {{"inLengths", required_argument, nullptr,
 class SimpleAppArgs
 {
-    template <typename T>
-    static T getSingleValueFromString(const std::string& valueStr)
-    {
-        std::istringstream iss(valueStr);
-
-        T ret;
-
-        iss >> ret;
-
-        return (ret);
-    };
-
-    template <typename T>
-    static std::vector<T> getTypeValuesFromString(const char* cstr_values)
-    {
-        std::string valuesStr(cstr_values);
-
-        std::vector<T> values;
-        std::size_t pos = 0;
-        std::size_t new_pos;
-
-        new_pos = valuesStr.find(',', pos);
-        while(new_pos != std::string::npos)
-        {
-            const std::string sliceStr = valuesStr.substr(pos, new_pos - pos);
-
-            T val = getSingleValueFromString<T>(sliceStr);
-
-            values.push_back(val);
-
-            pos     = new_pos + 1;
-            new_pos = valuesStr.find(',', pos);
-        };
-
-        std::string sliceStr = valuesStr.substr(pos);
-        T val                = getSingleValueFromString<T>(sliceStr);
-
-        values.push_back(val);
-
-        return (values);
-    };
-
     private:
     int option_index = 0;
 
@@ -460,6 +44,8 @@ class SimpleAppArgs
     int processArgs(int argc, char* argv[])
     {
+        using ck::host_common::getTypeValuesFromString;
+
         int ch;
 
         while(1)
@@ -514,7 +100,7 @@ class SimpleAppArgs
            (reduceDims.size() != 1 && reduceDims.size() != 3 && reduceDims.size() != 4))
             return (-1);
 
-        if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5)
+        if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5 && data_type != 6)
             return (-1);
 
         return (0);
@@ -525,87 +111,92 @@
 bool test_reduce_no_index(int data_type,
                           int init_method,
                           std::vector<int> reduceDims,
                           std::vector<size_t> inLengths,
+                          ReduceTensorOp reduceOpId,
+                          bool propagateNan,
                           float alpha,
                           float beta)
 {
+    using ck::profiler::profile_reduce_impl;
+
     bool result = true;
 
     if(data_type == 0)
     {
-        switch(reduceDims.size())
-        {
-        case 1:
-            result = test_reduce_no_index_impl<float, float, float, Rank, 1>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 3:
-            result = test_reduce_no_index_impl<float, float, float, Rank, 3>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 4:
-            result = test_reduce_no_index_impl<float, float, float, Rank, 4>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        };
+        result = profile_reduce_impl<float, float, float>(
+            true, init_method, false, false, inLengths, reduceDims, reduceOpId, propagateNan, false, alpha, beta);
     }
     else if(data_type == 1)
     {
-        switch(reduceDims.size())
-        {
-        case 1:
-            result = test_reduce_no_index_impl<half_t, float, half_t, Rank, 1>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 3:
-            result = test_reduce_no_index_impl<half_t, float, half_t, Rank, 3>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 4:
-            result = test_reduce_no_index_impl<half_t, float, half_t, Rank, 4>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        };
+        result = profile_reduce_impl<half_t, float, half_t>(
+            true, init_method, false, false, inLengths, reduceDims, reduceOpId, propagateNan, false, alpha, beta);
    }
    else if(data_type == 3)
    {
-        switch(reduceDims.size())
-        {
-        case 1:
-            result = test_reduce_no_index_impl<int8_t, int32_t, int8_t, Rank, 1>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 3:
-            result = test_reduce_no_index_impl<int8_t, int32_t, int8_t, Rank, 3>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 4:
-            result = test_reduce_no_index_impl<int8_t, int32_t, int8_t, Rank, 4>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        };
+        result = profile_reduce_impl<int8_t, int32_t, int8_t>(
+            true, init_method, false, false, inLengths, reduceDims, reduceOpId, propagateNan, false, alpha, beta);
    }
    else if(data_type == 5)
    {
-        switch(reduceDims.size())
-        {
-        case 1:
-            result = test_reduce_no_index_impl<bhalf_t, float, bhalf_t, Rank, 1>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 3:
-            result = test_reduce_no_index_impl<bhalf_t, float, bhalf_t, Rank, 3>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 4:
-            result = test_reduce_no_index_impl<bhalf_t, float, bhalf_t, Rank, 4>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        };
+        result = profile_reduce_impl<bhalf_t, float, bhalf_t>(
+            true, init_method, false, false, inLengths, reduceDims, reduceOpId, propagateNan, false, alpha, beta);
+    }
+    else if(data_type == 6)
+    {
+        result = profile_reduce_impl<double, double, double>(
+            true, init_method, false, false, inLengths, reduceDims, reduceOpId, propagateNan, false, alpha, beta);
    }
 
     return (result);
 };
 
+constexpr ReduceTensorOp reduceOpId = ReduceTensorOp::AVG;
+constexpr bool propagateNan         = false;
+
 int main(int argc, char* argv[])
 {
     SimpleAppArgs args;
 
@@ -621,8 +212,14 @@ int main(int argc, char* argv[])
             {0, 1, 2, 3}, {0, 1, 2}, {1, 2, 3}, {0, 1, 3}, {0, 2, 3}, {0}, {1}, {2}, {3}};
 
         for(auto& reduceDims : v_reduceDims)
-            result = result && test_reduce_no_index(data_type, init_method, reduceDims, inLengths, 1.0f, 0.0f);
+            result = result && test_reduce_no_index(data_type, init_method, reduceDims, inLengths, reduceOpId, propagateNan, 1.0f, 0.0f);
     }
     else
     {
@@ -636,6 +233,8 @@ int main(int argc, char* argv[])
                                        args.init_method,
                                        args.reduceDims,
                                        args.inLengths,
+                                       reduceOpId,
+                                       propagateNan,
                                        args.scales[0],
                                        args.scales[1]);
     }
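Both test drivers validate the same contract, out = alpha * reduce(in) + beta * out_prev, against ReductionHost. As a self-contained illustration of those alpha/beta semantics (a minimal stand-in for the reference computation, not CK's ReductionHost), an AVG reduction over the rows of a row-major 2-D tensor looks like:

    #include <cstddef>
    #include <vector>

    // Minimal reference for out[i] = alpha * avg(in[i][:]) + beta * out[i].
    // Mirrors the alpha/beta convention exercised by the tests above.
    void avg_reduce_rows(const std::vector<float>& in, std::vector<float>& out,
                         std::size_t rows, std::size_t cols, float alpha, float beta)
    {
        for(std::size_t i = 0; i < rows; ++i)
        {
            float acc = 0.0f;
            for(std::size_t j = 0; j < cols; ++j)
                acc += in[i * cols + j];

            acc /= static_cast<float>(cols); // AVG divides by the reduce length

            out[i] = alpha * acc + beta * out[i];
        }
    }

This is also why the tests only pre-initialize out_ref (and copy it into out) when beta is nonzero: with beta == 0 the previous output never contributes.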
diff --git a/test/reduce/reduce_util.hpp b/test/reduce/reduce_util.hpp
deleted file mode 100644
index 9eb66513bf..0000000000
--- a/test/reduce/reduce_util.hpp
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef REDUCE_UTILS_HPP
-#define REDUCE_UTILS_HPP
-
-#include "data_type.hpp"
-
-namespace ck {
-namespace reduce_util {
-
-template <typename InT, typename OutT>
-void to_f32_vector(const Tensor<InT>& src, Tensor<OutT>& dst)
-{
-    for(std::size_t i = 0; i < src.mData.size(); ++i)
-        dst.mData[i] = type_convert<OutT>(src.mData[i]);
-}
-
-} // namespace reduce_util
-
-} // namespace ck
-#endif
diff --git a/test/reduce/reduce_with_index.cpp b/test/reduce/reduce_with_index.cpp
index d7d5e551a2..c1918bf388 100644
--- a/test/reduce/reduce_with_index.cpp
+++ b/test/reduce/reduce_with_index.cpp
@@ -1,387 +1,10 @@
 #include "getopt.h"
 
-#include "device_reduce_instance.hpp"
-#include "reduction_enums.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "host_reduction.hpp"
-#include "check_err.hpp"
-#include "reduce_util.hpp"
+
+#include "host_common_util.hpp"
+#include "profile_reduce_impl.hpp"
 
 using namespace ck;
 
-namespace {
-
-template <int Rank, int NumReduceDim>
-static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduceDims)
-{
-    assert(NumReduceDim == reduceDims.size());
-
-    int reduceFlag = 0;
-
-    // flag the bits for the reduceDims
-    for(int i = 0; i < NumReduceDim; i++)
-    {
-        reduceFlag |= 1 << reduceDims[i];
-    };
-
-    std::vector<int> invariantDims;
-
-    // collect invariant dimensions
-    for(int i = 0; i < Rank; i++)
-        if((reduceFlag & (1 << i)) == 0)
-        {
-            invariantDims.push_back(i);
-        };
-
-    return invariantDims;
-};
-
-constexpr int Rank = 4;
-
-constexpr ReduceTensorOp ReduceOpId      = ReduceTensorOp::AMAX;
-constexpr NanPropagation NanOpt          = NanPropagation::PROPAGATE_NAN;
-constexpr bool PropagateNan              = false;
-constexpr ReduceTensorIndices IndicesOpt = ReduceTensorIndices::FLATTENED_INDICES;
-constexpr bool NeedIndices               = true;
-
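The deleted get_invariant_dims above relies on a small bitmask trick: set bit d for every reduced dimension d, then keep exactly the dimensions whose bits stay clear. The same technique as a standalone, runnable function:

    #include <vector>

    // Collect the dimensions of a rank-`rank` tensor that are NOT reduced.
    // Bit d of reduceFlag is set iff dimension d appears in reduceDims.
    std::vector<int> invariant_dims(int rank, const std::vector<int>& reduceDims)
    {
        int reduceFlag = 0;
        for(int d : reduceDims)
            reduceFlag |= 1 << d;

        std::vector<int> invariantDims;
        for(int d = 0; d < rank; ++d)
            if((reduceFlag & (1 << d)) == 0)
                invariantDims.push_back(d);

        return invariantDims;
    }

    // e.g. invariant_dims(4, {0, 2, 3}) == {1}

Because the flag fits in an int, this works for any rank up to the bit width of int, which comfortably covers the rank-4 tensors used by these tests.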
-template <typename InDataType, typename AccDataType, typename OutDataType, int Rank, int NumReduceDim>
-bool test_reduce_with_index_impl(int init_method,
-                                 const std::vector<size_t>& inLengths,
-                                 const std::vector<int>& reduceDims,
-                                 float alpha,
-                                 float beta)
-{
-    using namespace ck::tensor_operation::device;
-    using namespace ck::tensor_operation::device::device_reduce_instance;
-    using namespace ck::host_reduce;
-
-    Tensor<InDataType> in(inLengths);
-
-    std::vector<size_t> outLengths;
-
-    const auto invariantDims = get_invariant_dims<Rank, NumReduceDim>(reduceDims);
-
-    if(reduceDims.size() == Rank)
-        outLengths.push_back(1);
-    else
-        for(auto dim : invariantDims)
-            outLengths.push_back(inLengths[dim]);
-
-    Tensor<OutDataType> out_ref(outLengths);
-    Tensor<OutDataType> out(outLengths);
-    Tensor<int> out_indices_ref(outLengths);
-    Tensor<int> out_indices(outLengths);
-
-    // only used when the OutDataType is bhalf_t
-    Tensor<float> out_ref_fp32(outLengths);
-    Tensor<float> out_fp32(outLengths);
-
-    auto inStrides  = in.mDesc.GetStrides();
-    auto outStrides = out.mDesc.GetStrides();
-
-    size_t invariant_total_length = out.mDesc.GetElementSize();
-    size_t reduce_total_length    = in.mDesc.GetElementSize() / invariant_total_length;
-
-    std::size_t num_thread = 1;
-
-    switch(init_method)
-    {
-    case 0: break;
-    case 1:
-        in.GenerateTensorValue(GeneratorTensor_1<InDataType>{1}, num_thread);
-        if(beta != 0.0f)
-            out_ref.GenerateTensorValue(GeneratorTensor_1<OutDataType>{1}, num_thread);
-        break;
-    case 2:
-        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}, num_thread);
-        if(beta != 0.0f)
-            out_ref.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5}, num_thread);
-        break;
-    default:
-        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0}, num_thread);
-        if(beta != 0.0f)
-            out_ref.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-5.0, 5.0}, num_thread);
-    }
-
-    if(beta != 0.0f)
-        for(size_t i = 0; i < out_ref.mDesc.GetElementSpace(); i++)
-            out.mData[i] = out_ref.mData[i];
-
-    // these buffers are usually provided by the user application
-    DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpace());
-    DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpace());
-
-    in_dev.ToDevice(in.mData.data());
-
-    if(beta != 0.0f)
-        out_dev.ToDevice(out.mData.data());
-
-    size_t indicesSizeInBytes = NeedIndices ? out.mDesc.GetElementSize() * sizeof(int) : 0;
-
-    DeviceMem out_indices_dev(indicesSizeInBytes);
-
-    using InElementwiseOperation_0 =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation;
-    using AccElementwiseOperation_0 =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation;
-    using InElementwiseOperation_1 =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::InElementwiseOperation;
-    using AccElementwiseOperation_1 =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::AccElementwiseOperation;
-    using InElementwiseOperation_2 =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::InElementwiseOperation;
-    using AccElementwiseOperation_2 =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::AccElementwiseOperation;
-
-    using DeviceReduceInstPtr0 = DeviceReducePtr<InElementwiseOperation_0, AccElementwiseOperation_0>;
-    using DeviceReduceInstPtr1 = DeviceReducePtr<InElementwiseOperation_1, AccElementwiseOperation_1>;
-    using DeviceReduceInstPtr2 = DeviceReducePtr<InElementwiseOperation_2, AccElementwiseOperation_2>;
-
-    std::vector<DeviceReduceInstPtr0> reduce0_ptrs;
-    std::vector<DeviceReduceInstPtr1> reduce1_ptrs;
-    std::vector<DeviceReduceInstPtr2> reduce2_ptrs;
-
-    add_device_reduce_instance_threadwise<InDataType, AccDataType, OutDataType, Rank, NumReduceDim, ReduceOpId, NanOpt, IndicesOpt>(reduce0_ptrs);
-
-    add_device_reduce_instance_blockwise<InDataType, AccDataType, OutDataType, Rank, NumReduceDim, ReduceOpId, NanOpt, IndicesOpt>(reduce0_ptrs);
-
-    add_device_reduce_instance_multiblock_partial_reduce<InDataType, AccDataType, OutDataType, Rank, NumReduceDim, ReduceOpId, NanOpt, IndicesOpt>(reduce1_ptrs);
-
-    add_device_reduce_instance_blockwise_second_call<AccDataType, AccDataType, OutDataType, Rank, NumReduceDim, ReduceOpId, NanOpt, IndicesOpt>(reduce2_ptrs);
-
-    if(reduce0_ptrs.empty() && reduce1_ptrs.empty())
-    {
-        throw std::runtime_error("Wrong! No device REDUCE instance found");
-    };
-
-    bool result = true;
-
-    ReductionHost<InDataType, AccDataType, OutDataType, ReduceOpId, Rank, NumReduceDim, PropagateNan, NeedIndices>
-        hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
-
-    hostReduce.Run(
-        alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data());
-
-    const auto i_inLengths  = to_int_vector(inLengths);
-    const auto i_inStrides  = to_int_vector(inStrides);
-    const auto i_outLengths = to_int_vector(outLengths);
-    const auto i_outStrides = to_int_vector(outStrides);
-
-    for(auto& reduce_ptr : reduce0_ptrs)
-    {
-        auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims);
-
-        DeviceMem ws_dev(wsSizeInBytes);
-
-        InElementwiseOperation_0 in_elementwise_op_0(static_cast<int32_t>(reduce_total_length));
-        AccElementwiseOperation_0 acc_elementwise_op_0(static_cast<int32_t>(reduce_total_length));
-
-        auto argument_ptr = reduce_ptr->MakeArgumentPointer(
-            i_inLengths, i_inStrides, i_outLengths, i_outStrides, reduceDims, alpha, beta,
-            in_dev.GetDeviceBuffer(), out_dev.GetDeviceBuffer(),
-            out_indices_dev.GetDeviceBuffer(), ws_dev.GetDeviceBuffer(),
-            in_elementwise_op_0, acc_elementwise_op_0);
-
-        if(!reduce_ptr->IsSupportedArgument(argument_ptr.get()))
-            continue;
-
-        auto invoker_ptr = reduce_ptr->MakeInvokerPointer();
-
-        (void)invoker_ptr->Run(argument_ptr.get());
-
-        out_dev.FromDevice(out.mData.data());
-
-        bool single_result = true;
-
-        if constexpr(std::is_same<OutDataType, bhalf_t>::value ||
-                     std::is_same<OutDataType, half_t>::value)
-        {
-            reduce_util::to_f32_vector(out, out_fp32);
-            reduce_util::to_f32_vector(out_ref, out_ref_fp32);
-            single_result = ck::utils::check_err(
-                out_fp32.mData, out_ref_fp32.mData, "Error: incorrect data result!");
-        }
-        else
-        {
-            single_result =
-                ck::utils::check_err(out.mData, out_ref.mData, "Error: incorrect data result!");
-        };
-
-        if(NeedIndices)
-        {
-            out_indices_dev.FromDevice(out_indices.mData.data());
-            single_result = single_result && ck::utils::check_err(out_indices_ref.mData,
-                                                                  out_indices.mData,
-                                                                  "Error: incorrect index result!");
-        };
-
-        if(!single_result)
-        {
-            std::cout << "Fail Info: " << reduce_ptr->GetTypeString() << std::endl;
-            result = false;
-        }
-    };
-
-    for(auto& reduce_ptr : reduce1_ptrs)
-    {
-        auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims);
-
-        DeviceMem ws_dev(wsSizeInBytes);
-
-        InElementwiseOperation_1 in_elementwise_op_1(static_cast<int32_t>(reduce_total_length));
-        AccElementwiseOperation_1 acc_elementwise_op_1(static_cast<int32_t>(reduce_total_length));
-
-        auto argument_ptr = reduce_ptr->MakeArgumentPointer(
-            i_inLengths, i_inStrides, i_outLengths, i_outStrides, reduceDims, alpha, beta,
-            in_dev.GetDeviceBuffer(), out_dev.GetDeviceBuffer(),
-            out_indices_dev.GetDeviceBuffer(), ws_dev.GetDeviceBuffer(),
-            in_elementwise_op_1, acc_elementwise_op_1);
-
-        if(!reduce_ptr->IsSupportedArgument(argument_ptr.get()))
-            continue;
-
-        std::string reduce_name = reduce_ptr->GetTypeString();
-
-        auto invoker_ptr = reduce_ptr->MakeInvokerPointer();
-
-        (void)invoker_ptr->Run(argument_ptr.get());
-
-        std::vector<int> inLengths2 = reduce_ptr->GetWorkspace2dLengths(argument_ptr.get());
-        std::vector<int> inStrides2{inLengths2[1], 1};
-
-        for(auto& reduce2_ptr : reduce2_ptrs)
-        {
-            InElementwiseOperation_2 in_elementwise_op_2(static_cast<int32_t>(reduce_total_length));
-            AccElementwiseOperation_2 acc_elementwise_op_2(static_cast<int32_t>(reduce_total_length));
-
-            auto argument2_ptr = reduce2_ptr->MakeArgumentPointer(
-                inLengths2, inStrides2, i_outLengths, i_outStrides, reduceDims, alpha, beta,
-                ws_dev.GetDeviceBuffer(), out_dev.GetDeviceBuffer(),
-                out_indices_dev.GetDeviceBuffer(), ws_dev.GetDeviceBuffer(),
-                in_elementwise_op_2, acc_elementwise_op_2);
-
-            if(!reduce2_ptr->IsSupportedArgument(argument2_ptr.get()))
-                continue;
-
-            std::string reduce2_name = reduce2_ptr->GetTypeString();
-
-            auto invoker2_ptr = reduce2_ptr->MakeInvokerPointer();
-
-            (void)invoker2_ptr->Run(argument2_ptr.get());
-
-            out_dev.FromDevice(out.mData.data());
-
-            bool single_result = true;
-
-            if constexpr(std::is_same<OutDataType, bhalf_t>::value ||
-                         std::is_same<OutDataType, half_t>::value)
-            {
-                reduce_util::to_f32_vector(out, out_fp32);
-                reduce_util::to_f32_vector(out_ref, out_ref_fp32);
-                single_result = ck::utils::check_err(
-                    out_fp32.mData, out_ref_fp32.mData, "Error: incorrect data result!");
-            }
-            else
-            {
-                single_result =
-                    ck::utils::check_err(out.mData, out_ref.mData, "Error: incorrect data result!");
-            };
-
-            if(NeedIndices)
-            {
-                out_indices_dev.FromDevice(out_indices.mData.data());
-                single_result =
-                    single_result && ck::utils::check_err(out_indices_ref.mData,
                                                           out_indices.mData,
                                                           "Error: incorrect index result!");
-            };
-
-            if(!single_result)
-            {
-                std::cout << "Fail Info: " << reduce_ptr->GetTypeString() << " => "
-                          << reduce2_ptr->GetTypeString() << std::endl;
-                result = false;
-            }
-        };
-    };
-
-    return (result);
-};
-
-} // anonymous namespace
-
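For ReduceTensorOp::AMAX with FLATTENED_INDICES, the device instances return, per output element, the flattened offset of the largest-magnitude input within the reduced slice; the index checks above compare that against out_indices_ref. A self-contained host analogue for a single reduced slice (an illustrative sketch, not the ReductionHost implementation):

    #include <cmath>
    #include <cstddef>
    #include <utility>
    #include <vector>

    // For one reduced slice, return {amax_value, flattened_index_of_amax},
    // mirroring what the per-element index check verifies.
    std::pair<float, int> amax_with_index(const std::vector<float>& slice)
    {
        float best  = 0.0f;
        int best_id = 0;
        for(std::size_t k = 0; k < slice.size(); ++k)
            if(std::abs(slice[k]) > best)
            {
                best    = std::abs(slice[k]);
                best_id = static_cast<int>(k);
            }
        return {best, best_id};
    }

Indexed reductions are also the reason this file never adds an atomic-add instance: partial results landing in arbitrary order cannot carry a winning index, so only the threadwise, blockwise, and partial-reduce-plus-second-call paths are populated above.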
 static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
                                        {"reduceDimensions", required_argument, nullptr, 'R'},
                                        {"scales", required_argument, nullptr, 'S'},
@@ -390,48 +13,6 @@ static struct option long_options[] = {{"inLengths", required_argument, nullptr,
 class SimpleAppArgs
 {
-    template <typename T>
-    static T getSingleValueFromString(const std::string& valueStr)
-    {
-        std::istringstream iss(valueStr);
-
-        T ret;
-
-        iss >> ret;
-
-        return (ret);
-    };
-
-    template <typename T>
-    static std::vector<T> getTypeValuesFromString(const char* cstr_values)
-    {
-        std::string valuesStr(cstr_values);
-
-        std::vector<T> values;
-        std::size_t pos = 0;
-        std::size_t new_pos;
-
-        new_pos = valuesStr.find(',', pos);
-        while(new_pos != std::string::npos)
-        {
-            const std::string sliceStr = valuesStr.substr(pos, new_pos - pos);
-
-            T val = getSingleValueFromString<T>(sliceStr);
-
-            values.push_back(val);
-
-            pos     = new_pos + 1;
-            new_pos = valuesStr.find(',', pos);
-        };
-
-        std::string sliceStr = valuesStr.substr(pos);
-        T val                = getSingleValueFromString<T>(sliceStr);
-
-        values.push_back(val);
-
-        return (values);
-    };
-
     private:
     int option_index = 0;
 
@@ -463,6 +44,8 @@ class SimpleAppArgs
     int processArgs(int argc, char* argv[])
     {
+        using ck::host_common::getTypeValuesFromString;
+
         int ch;
 
         while(1)
@@ -517,7 +100,7 @@ class SimpleAppArgs
            (reduceDims.size() != 1 && reduceDims.size() != 3 && reduceDims.size() != 4))
             return (-1);
 
-        if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5)
+        if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5 && data_type != 6)
             return (-1);
 
         return (0);
@@ -528,87 +111,92 @@
 bool test_reduce_with_index(int data_type,
                             int init_method,
                             std::vector<int> reduceDims,
                             std::vector<size_t> inLengths,
+                            ReduceTensorOp reduceOpId,
+                            bool propagateNan,
                             float alpha,
                             float beta)
 {
+    using ck::profiler::profile_reduce_impl;
+
     bool result = true;
 
     if(data_type == 0)
     {
-        switch(reduceDims.size())
-        {
-        case 1:
-            result = test_reduce_with_index_impl<float, float, float, Rank, 1>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 3:
-            result = test_reduce_with_index_impl<float, float, float, Rank, 3>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 4:
-            result = test_reduce_with_index_impl<float, float, float, Rank, 4>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        };
+        result = profile_reduce_impl<float, float, float>(
            true, init_method, false, false, inLengths, reduceDims, reduceOpId, propagateNan, true, alpha, beta);
     }
     else if(data_type == 1)
     {
-        switch(reduceDims.size())
-        {
-        case 1:
-            result = test_reduce_with_index_impl<half_t, half_t, half_t, Rank, 1>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 3:
-            result = test_reduce_with_index_impl<half_t, half_t, half_t, Rank, 3>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 4:
-            result = test_reduce_with_index_impl<half_t, half_t, half_t, Rank, 4>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        };
+        result = profile_reduce_impl<half_t, half_t, half_t>(
            true, init_method, false, false, inLengths, reduceDims, reduceOpId, propagateNan, true, alpha, beta);
     }
     else if(data_type == 3)
     {
-        switch(reduceDims.size())
-        {
-        case 1:
-            result = test_reduce_with_index_impl<int8_t, int8_t, int8_t, Rank, 1>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 3:
-            result = test_reduce_with_index_impl<int8_t, int8_t, int8_t, Rank, 3>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 4:
-            result = test_reduce_with_index_impl<int8_t, int8_t, int8_t, Rank, 4>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        };
+        result = profile_reduce_impl<int8_t, int8_t, int8_t>(
            true, init_method, false, false, inLengths, reduceDims, reduceOpId, propagateNan, true, alpha, beta);
     }
     else if(data_type == 5)
     {
-        switch(reduceDims.size())
-        {
-        case 1:
-            result = test_reduce_with_index_impl<bhalf_t, float, bhalf_t, Rank, 1>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 3:
-            result = test_reduce_with_index_impl<bhalf_t, float, bhalf_t, Rank, 3>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 4:
-            result = test_reduce_with_index_impl<bhalf_t, float, bhalf_t, Rank, 4>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        };
+        result = profile_reduce_impl<bhalf_t, float, bhalf_t>(
            true, init_method, false, false, inLengths, reduceDims, reduceOpId, propagateNan, true, alpha, beta);
+    }
+    else if(data_type == 6)
+    {
+        result = profile_reduce_impl<double, double, double>(
            true, init_method, false, false, inLengths, reduceDims, reduceOpId, propagateNan, true, alpha, beta);
     }
 
     return (result);
 };
 
+constexpr ReduceTensorOp reduceOpId = ReduceTensorOp::AMAX;
+constexpr bool propagateNan         = false;
+
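Since both tests now funnel into the shared ck::profiler::profile_reduce_impl, the same entry point can be driven directly. A hypothetical standalone caller is sketched below; the positional booleans are read here as (do_verification, do_log, time_kernel, use_index) to match the calls above, but those parameter names, as well as the example lengths, are assumptions:

    #include <cstddef>
    #include <vector>

    #include "profile_reduce_impl.hpp"

    // Hypothetical driver: verify an fp32 AMAX reduction (with indices) over
    // all four dimensions, matching test_reduce_with_index's data_type == 0 path.
    bool amax_fp32_all_dims()
    {
        std::vector<std::size_t> inLengths{16, 64, 32, 960}; // example shape (assumed)
        std::vector<int> reduceDims{0, 1, 2, 3};

        return ck::profiler::profile_reduce_impl<float, float, float>(
            true,  // do_verification (assumed name)
            2,     // init_method: random integer data in [-5, 5)
            false, // do_log (assumed name)
            false, // time_kernel (assumed name)
            inLengths,
            reduceDims,
            ck::ReduceTensorOp::AMAX,
            false, // propagateNan
            true,  // request index output
            1.0f,  // alpha
            0.0f); // beta
    }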
 int main(int argc, char* argv[])
 {
     SimpleAppArgs args;
 
@@ -624,8 +212,14 @@ int main(int argc, char* argv[])
             {0, 1, 2, 3}, {0, 1, 2}, {1, 2, 3}, {0, 1, 3}, {0, 2, 3}, {0}, {1}, {2}, {3}};
 
         for(auto& reduceDims : v_reduceDims)
-            result = result && test_reduce_with_index(data_type, init_method, reduceDims, inLengths, 1.0f, 0.0f);
+            result = result && test_reduce_with_index(data_type, init_method, reduceDims, inLengths, reduceOpId, propagateNan, 1.0f, 0.0f);
     }
     else
     {
@@ -639,6 +233,8 @@ int main(int argc, char* argv[])
                                          args.init_method,
                                          args.reduceDims,
                                          args.inLengths,
+                                         reduceOpId,
+                                         propagateNan,
                                          args.scales[0],
                                          args.scales[1]);
     }