From cc5b464fe15636afe1e543fe1d3ffe6e04b1b9c0 Mon Sep 17 00:00:00 2001 From: Qianfeng Date: Sat, 13 Aug 2022 14:10:01 +0800 Subject: [PATCH] Add examples for reduction fp16/fp32/bp16/int8/fp64 for 3d/4d/5d (#342) * Update the reduce_blockwise example to support user specified data type and input+reducing dimensions * Add examples for using reduce_multiblock_atomic_add * Add more running examples to the default command-line * Remove un-necessary header including * Update to the example README.md [ROCm/composable_kernel commit: 14932e8de36c09c247504f9335110ce6ca0ce502] --- example/12_reduce/CMakeLists.txt | 1 + example/12_reduce/README.md | 35 +- example/12_reduce/reduce_blockwise.cpp | 394 +++++++----------- example/12_reduce/reduce_blockwise_impl.hpp | 275 ++++++++++++ example/12_reduce/reduce_example_common.hpp | 48 +++ .../reduce_multiblock_atomic_add.cpp | 212 ++++++++++ .../reduce_multiblock_atomic_add_impl.hpp | 230 ++++++++++ 7 files changed, 952 insertions(+), 243 deletions(-) create mode 100644 example/12_reduce/reduce_blockwise_impl.hpp create mode 100644 example/12_reduce/reduce_example_common.hpp create mode 100644 example/12_reduce/reduce_multiblock_atomic_add.cpp create mode 100644 example/12_reduce/reduce_multiblock_atomic_add_impl.hpp diff --git a/example/12_reduce/CMakeLists.txt b/example/12_reduce/CMakeLists.txt index 9045a78a85..6e58ed9338 100644 --- a/example/12_reduce/CMakeLists.txt +++ b/example/12_reduce/CMakeLists.txt @@ -1,2 +1,3 @@ add_example_executable(example_reduce_blockwise reduce_blockwise.cpp) +add_example_executable(example_reduce_multiblock_atomic_add reduce_multiblock_atomic_add.cpp) add_example_executable(example_reduce_blockwise_two_call reduce_blockwise_two_call.cpp) diff --git a/example/12_reduce/README.md b/example/12_reduce/README.md index 826d2f6c33..76d28527bb 100644 --- a/example/12_reduce/README.md +++ b/example/12_reduce/README.md @@ -2,20 +2,41 @@ ## Run ```example_reduce_blockwise``` ```bash -# -D : input 4-d tensor lengths +# -D : input 3d/4d/5d tensor lengths +# -R : reduce dimension ids # -v : verification (0=no, 1=yes) -#arg1: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value) -#arg2: time kernel (0=no, 1=yes) -./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 1 +#arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64, 7: int4) +#arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value) +#arg3: time kernel (0=no, 1=yes) +./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 0 2 1 ``` Result ``` -./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 1 -launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1} +./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 0 2 1 +launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1} Warm up 1 time Start running 10 times... -Perf: 0.282592 ms, 222.641 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1> +Perf: 0.238063 ms, 264.285 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1> +``` + +## Run ```example_reduce_multiblock_atomic_add``` +```bash +# -D : input 3d/4d/5d tensor lengths +# -R : reduce dimension ids +# -v : verification (0=no, 1=yes) +#arg1: data type (0: fp32, 1: fp64) +#arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value) +#arg3: time kernel (0=no, 1=yes) +./bin/example_reduce_multiblock_atomic_add -D 16,64,32,960 -v 1 0 2 0 +``` + +Result +``` +./bin/example_reduce_multiblock_atomic_add -D 16,64,32,960 -v 1 0 2 0 +Perf: 0 ms, inf GB/s, DeviceReduceMultiBlock<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1> +echo $? +0 ``` # Instructions for ```example_reduce_blockwise_two_call``` diff --git a/example/12_reduce/reduce_blockwise.cpp b/example/12_reduce/reduce_blockwise.cpp index a410f2a055..7cebbefb62 100644 --- a/example/12_reduce/reduce_blockwise.cpp +++ b/example/12_reduce/reduce_blockwise.cpp @@ -2,64 +2,17 @@ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. #include -#include #include #include #include -#include "ck/ck.hpp" #include "ck/utility/reduction_enums.hpp" -#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" -#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp" - -#include "ck/library/utility/check_err.hpp" -#include "ck/library/utility/device_memory.hpp" -#include "ck/library/utility/host_tensor.hpp" -#include "ck/library/utility/host_tensor_generator.hpp" -#include "ck/library/utility/host_common_util.hpp" -#include "ck/library/utility/host_reduction.hpp" +#include "reduce_blockwise_impl.hpp" +#include "reduce_example_common.hpp" using namespace ck; using namespace ck::tensor_operation::device; -using InDataType = ck::half_t; -using OutDataType = ck::half_t; -using AccDataType = float; - -constexpr int Rank = 4; -constexpr int NumReduceDim = 3; - -constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::NORM2; -constexpr bool PropagateNan = true; -constexpr bool OutputIndex = false; - -using ReduceOperation = typename reduce_binary_operator::opType; -using InElementwiseOperation = - typename reduce_unary_operator::InElementwiseOperation; -using AccElementwiseOperation = - typename reduce_unary_operator::AccElementwiseOperation; - -using DeviceReduceInstance = DeviceReduceMultiBlock; - static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'}, {"verify", required_argument, nullptr, 'v'}, {"help", no_argument, nullptr, '?'}, @@ -72,10 +25,12 @@ class SimpleAppArgs public: std::vector inLengths = {16, 64, 32, 960}; + std::vector reduceDims = {0, 1, 2}; std::vector scales = {1.0f, 0.0f}; bool do_verification = true; - int init_method = 1; + int data_type = 1; + int init_method = 2; bool time_kernel = true; public: @@ -84,13 +39,17 @@ class SimpleAppArgs std::cout << "Usage of " << cmd << std::endl; std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths" << std::endl; + std::cout << "--reduceDims or -R, comma separated list of to-reduce dimensions" + << std::endl; std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by " "comparing with the host-based reduction" << std::endl; - std::cout << "Arg1 -- init method (0=no init, 1=single integer value, 2=scope integer " + std::cout << "Arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64, 7: int4)" + << std::endl; + std::cout << "Arg2 -- init method (0=no init, 1=single integer value, 2=scope integer " "value, 3=decimal value)" << std::endl; - std::cout << "Arg2 -- time kernel (0=no, 1=yes)" << std::endl; + std::cout << "Arg3 -- time kernel (0=no, 1=yes)" << std::endl; }; int processArgs(int argc, char* argv[]) @@ -101,7 +60,7 @@ class SimpleAppArgs while(1) { - ch = getopt_long(argc, argv, "D:v:l:", long_options, &option_index); + ch = getopt_long(argc, argv, "D:R:v:l:", long_options, &option_index); if(ch == -1) break; switch(ch) @@ -112,6 +71,12 @@ class SimpleAppArgs inLengths = getTypeValuesFromString(optarg); break; + case 'R': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + reduceDims = getTypeValuesFromString(optarg); + break; case 'v': if(!optarg) throw std::runtime_error("Invalid option format!"); @@ -129,9 +94,12 @@ class SimpleAppArgs }; }; - if(optind + 2 > argc) + if(optind + 3 > argc) + { throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!"); + }; + data_type = std::atoi(argv[optind++]); init_method = std::atoi(argv[optind++]); time_kernel = static_cast(std::atoi(argv[optind])); @@ -145,198 +113,152 @@ class SimpleAppArgs }; }; +template +bool reduce_blockwise_test(bool do_verification, + int init_method, + bool time_kernel, + const std::vector& inLengths, + const std::vector& reduceDims, + float alpha, + float beta) +{ + bool matched = false; + int result = 0; + + const auto tuple_object = reduce_shape_instances{}; + + static_for<0, std::tuple_size::value, 1>{}([&](auto i) { + if(matched) + return; + + using ShapeType = remove_cvref_t(tuple_object))>; + + if(ShapeType::Rank_ != inLengths.size() || ShapeType::NumReduceDim_ != reduceDims.size()) + return; + + result = reduce_blockwise_impl( + do_verification, init_method, time_kernel, inLengths, reduceDims, alpha, beta); + + matched = true; + }); + + return (result == 0) ? true : false; +}; + +constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::AVG; +constexpr bool PropagateNan = true; +constexpr bool OutputIndex = false; + int main(int argc, char* argv[]) { - const std::vector reduceDims{0, 1, 2}; - const std::vector invariantDims{3}; - - SimpleAppArgs args; + bool pass = true; if(argc > 1) { - if(args.processArgs(argc, argv) < 0) + SimpleAppArgs arg; + + if(arg.processArgs(argc, argv) < 0) return (-1); - }; - constexpr bool op_support_indices = - (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || - ReduceOpId == ReduceTensorOp::AMAX); - - // if input is half type, no reason to use float for indiced reduction operation and must use - // float for non-indiced reduction operation for accuracy - constexpr bool invalid_reduce_1 = - std::is_same::value && - ((!op_support_indices && !std::is_same::value) || - (op_support_indices && !std::is_same::value)); - - // if input is float type, no reason to use double for indiced reduction operation - constexpr bool invalid_reduce_2 = - std::is_same::value && - (op_support_indices && !std::is_same::value); - - // indices option can only be used when it is really needed - constexpr bool invalid_reduce_3 = (!op_support_indices && OutputIndex); - - constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3); - - if constexpr(invalid_reduce) - std::cout << "Reduction setting is not supported, exiting!" << std::endl; - - Tensor in(args.inLengths); - - std::vector outLengths; - - if(invariantDims.empty()) - outLengths.push_back(1); - else - for(auto dim : invariantDims) - outLengths.push_back(args.inLengths[dim]); - - Tensor out_ref(outLengths); - Tensor out(outLengths); - Tensor out_indices_ref(outLengths); - Tensor out_indices(outLengths); - - auto inStrides = in.mDesc.GetStrides(); - auto outStrides = out.mDesc.GetStrides(); - - size_t invariant_total_length = out.mDesc.GetElementSize(); - size_t reduce_total_length = in.mDesc.GetElementSize() / invariant_total_length; - - float alpha = args.scales[0]; - float beta = args.scales[1]; - - std::size_t num_thread = 1; - - if(args.do_verification) - { - switch(args.init_method) + if(arg.data_type == 0) { - case 0: break; - case 1: - in.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); - if(beta != 0.0f) - out_ref.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); - break; - case 2: - in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); - if(beta != 0.0f) - out_ref.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); - break; - default: - in.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, num_thread); - if(beta != 0.0f) - out_ref.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, num_thread); + pass = reduce_blockwise_test( + arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inLengths, + arg.reduceDims, + arg.scales[0], + arg.scales[1]); } - - if(beta != 0.0f) - for(size_t i = 0; i < out_ref.mDesc.GetElementSpaceSize(); i++) - out.mData[i] = out_ref.mData[i]; - }; - - // these buffers are usually provided by the user application - DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpaceSize()); - DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize()); - - in_dev.ToDevice(in.mData.data()); - - if(beta != 0.0f) - out_dev.ToDevice(out.mData.data()); - - size_t indicesSizeInBytes = OutputIndex ? out.mDesc.GetElementSize() * sizeof(int32_t) : 0; - - DeviceMem out_index_dev(indicesSizeInBytes); - - InElementwiseOperation in_elementwise_op; - AccElementwiseOperation acc_elementwise_op; - - std::tie(in_elementwise_op, acc_elementwise_op) = - reduce_unary_operator::GetElementwiseOperator( - static_cast(reduce_total_length)); - - if(args.do_verification) - { - ReductionHost - hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims); - - hostReduce.Run(alpha, - in.mData.data(), - beta, - out_ref.mData.data(), - out_indices_ref.mData.data(), - in_elementwise_op, - acc_elementwise_op); - }; - - std::vector i_inLengths; - std::vector i_inStrides; - std::vector i_outLengths; - std::vector i_outStrides; - - i_inLengths.assign(args.inLengths.begin(), args.inLengths.end()); - i_inStrides.assign(inStrides.begin(), inStrides.end()); - i_outLengths.assign(outLengths.begin(), outLengths.end()); - i_outStrides.assign(outStrides.begin(), outStrides.end()); - - auto reduce = DeviceReduceInstance{}; - - auto argument_ptr = reduce.MakeArgumentPointer(i_inLengths, - i_inStrides, - i_outLengths, - i_outStrides, - reduceDims, - alpha, - beta, - in_dev.GetDeviceBuffer(), - nullptr, - out_dev.GetDeviceBuffer(), - out_index_dev.GetDeviceBuffer(), - in_elementwise_op, - acc_elementwise_op); - - if(!reduce.IsSupportedArgument(argument_ptr.get())) - { - std::cout - << "The runtime parameters seems not supported by the DeviceReduce instance, exiting!" - << std::endl; - }; - - std::string reduce_name = reduce.GetTypeString(); - - auto invoker_ptr = reduce.MakeInvokerPointer(); - - float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, args.time_kernel}); - - std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InDataType) + - invariant_total_length * sizeof(OutDataType); - - float gb_per_sec = num_bytes / 1.E6 / avg_time; - - std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " << reduce_name - << std::endl; - - bool pass = true; - - if(args.do_verification) - { - out_dev.FromDevice(out.mData.data()); - pass = pass && ck::utils::check_err(out.mData, out_ref.mData); - - if(OutputIndex) + else if(arg.data_type == 1) { - out_index_dev.FromDevice(out_indices.mData.data()); - pass = pass && ck::utils::check_err(out_indices.mData, out_indices_ref.mData); - }; + pass = reduce_blockwise_test( + arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inLengths, + arg.reduceDims, + arg.scales[0], + arg.scales[1]); + } + else if(arg.data_type == 3) + { + pass = reduce_blockwise_test( + arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inLengths, + arg.reduceDims, + arg.scales[0], + arg.scales[1]); + } + else if(arg.data_type == 5) + { + pass = reduce_blockwise_test( + arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inLengths, + arg.reduceDims, + arg.scales[0], + arg.scales[1]); + } + else if(arg.data_type == 6) + { + pass = reduce_blockwise_test( + arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inLengths, + arg.reduceDims, + arg.scales[0], + arg.scales[1]); + } + } + else + { + // for testing half_t + pass = + pass && reduce_blockwise_test( + true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f); + + // for testing float + pass = pass && reduce_blockwise_test( + true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f); + + // for testing double + pass = pass && reduce_blockwise_test( + true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f); + + // for testing bhalf_t + pass = pass && + reduce_blockwise_test( + true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f); + + // for testing int8_t + pass = + pass && reduce_blockwise_test( + true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f); + + // for testing 3D input + pass = pass && reduce_blockwise_test( + true, 2, true, {16, 64, 960}, {0, 1}, 1.0f, 0.0f); + + // for testing 5D input + pass = pass && reduce_blockwise_test( + true, 2, true, {16, 64, 32, 2, 960}, {0, 1, 2, 3}, 1.0f, 0.0f); }; return (pass ? 0 : 1); -} +}; diff --git a/example/12_reduce/reduce_blockwise_impl.hpp b/example/12_reduce/reduce_blockwise_impl.hpp new file mode 100644 index 0000000000..c185773f63 --- /dev/null +++ b/example/12_reduce/reduce_blockwise_impl.hpp @@ -0,0 +1,275 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_enums.hpp" +#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" +#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/host_common_util.hpp" +#include "ck/library/utility/host_reduction.hpp" + +#include "reduce_example_common.hpp" + +template +int reduce_blockwise_impl(bool do_verification, + int init_method, + bool time_kernel, + const std::vector& inLengths, + const std::vector& reduceDims, + float alpha, + float beta) + +{ + using namespace ck; + using namespace ck::tensor_operation::device; + + constexpr bool op_support_indices = + (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || + ReduceOpId == ReduceTensorOp::AMAX); + + constexpr bool invalid_reduce_1 = OutputIndex && !op_support_indices; + + // 1) If InOutDataType is half_t, must use half_t as AccDataType for indexable reduction + // operations 2) If InOutDataType is half_t, must use float as AccDataType for non-indexable + // reduction operations + constexpr bool invalid_reduce_2 = + std::is_same::value && + ((!op_support_indices && !std::is_same::value) || + (op_support_indices && !std::is_same::value)); + + // 1) If InOutDataType is float, must use float as AccDataType for indexable reduction + // operations + constexpr bool invalid_reduce_3 = + std::is_same::value && + (op_support_indices && !std::is_same::value); + + // 1) If InOutDataType is int8_t, must use int8_t as AccDataType for indexable reduction + // operations 2) If InOutDataType is int8_t, must use int32_t as AccDataType for non-indexable + // reduction operations + constexpr bool invalid_reduce_4 = + std::is_same::value && + ((!op_support_indices && !std::is_same::value) || + (op_support_indices && !std::is_same::value)); + + // 1) If InOutDataType is int8_t, the supported operation must be either indexable operations or + // ADD/AVG + constexpr bool invalid_reduce_5 = std::is_same::value && + (!op_support_indices && ReduceOpId != ReduceTensorOp::ADD && + ReduceOpId != ReduceTensorOp::AVG); + + // 1) If InOutDataType is bhalf_t, must use float as AccDataType for all reduction operations + constexpr bool invalid_reduce_6 = + std::is_same::value && !std::is_same::value; + + constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3 || + invalid_reduce_4 || invalid_reduce_5 || invalid_reduce_6); + + if(invalid_reduce) + { + std::cerr << "The reduction setting is invalid, exiting!" << std::endl; + return (-1); + }; + + using ReduceOperation = typename reduce_binary_operator::opType; + using InElementwiseOperation = + typename reduce_unary_operator::InElementwiseOperation; + using AccElementwiseOperation = + typename reduce_unary_operator::AccElementwiseOperation; + + using DeviceReduceInstance = + ck::tensor_operation::device::DeviceReduceMultiBlock; // OutDstVectorSize + + Tensor in(inLengths); + + std::vector outLengths; + + std::vector invariantDims = get_invariant_dims(reduceDims); + + if(invariantDims.empty()) + outLengths.push_back(1); + else + for(auto dim : invariantDims) + outLengths.push_back(inLengths[dim]); + + Tensor out_ref(outLengths); + Tensor out(outLengths); + Tensor out_indices_ref(outLengths); + Tensor out_indices(outLengths); + + auto inStrides = in.mDesc.GetStrides(); + auto outStrides = out.mDesc.GetStrides(); + + size_t invariant_total_length = out.mDesc.GetElementSize(); + size_t reduce_total_length = in.mDesc.GetElementSize() / invariant_total_length; + + std::size_t num_thread = 1; + + if(do_verification) + { + switch(init_method) + { + case 0: break; + case 1: + in.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + break; + case 2: + in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + default: + in.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, + num_thread); + } + + if(beta != 0.0f) + for(size_t i = 0; i < out_ref.mDesc.GetElementSpaceSize(); i++) + out.mData[i] = out_ref.mData[i]; + }; + + // these buffers are usually provided by the user application + DeviceMem in_dev(sizeof(InOutDataType) * in.mDesc.GetElementSpaceSize()); + DeviceMem out_dev(sizeof(InOutDataType) * out.mDesc.GetElementSpaceSize()); + + in_dev.ToDevice(in.mData.data()); + + if(beta != 0.0f) + out_dev.ToDevice(out.mData.data()); + + size_t indicesSizeInBytes = OutputIndex ? out.mDesc.GetElementSize() * sizeof(int32_t) : 0; + + DeviceMem out_index_dev(indicesSizeInBytes); + + InElementwiseOperation in_elementwise_op; + AccElementwiseOperation acc_elementwise_op; + + std::tie(in_elementwise_op, acc_elementwise_op) = + reduce_unary_operator::GetElementwiseOperator( + static_cast(reduce_total_length)); + + if(do_verification) + { + ReductionHost + hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims); + + hostReduce.Run(alpha, + in.mData.data(), + beta, + out_ref.mData.data(), + out_indices_ref.mData.data(), + in_elementwise_op, + acc_elementwise_op); + }; + + std::vector i_inLengths; + std::vector i_inStrides; + std::vector i_outLengths; + std::vector i_outStrides; + + i_inLengths.assign(inLengths.begin(), inLengths.end()); + i_inStrides.assign(inStrides.begin(), inStrides.end()); + i_outLengths.assign(outLengths.begin(), outLengths.end()); + i_outStrides.assign(outStrides.begin(), outStrides.end()); + + auto reduce = DeviceReduceInstance{}; + + auto argument_ptr = reduce.MakeArgumentPointer(i_inLengths, + i_inStrides, + i_outLengths, + i_outStrides, + reduceDims, + alpha, + beta, + in_dev.GetDeviceBuffer(), + nullptr, + out_dev.GetDeviceBuffer(), + out_index_dev.GetDeviceBuffer(), + in_elementwise_op, + acc_elementwise_op); + + if(!reduce.IsSupportedArgument(argument_ptr.get())) + { + std::cerr + << "The runtime parameters seems not supported by the DeviceReduce instance, exiting!" + << std::endl; + + return (-2); + }; + + std::string reduce_name = reduce.GetTypeString(); + + auto invoker_ptr = reduce.MakeInvokerPointer(); + + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InOutDataType) + + invariant_total_length * sizeof(InOutDataType); + + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " << reduce_name + << std::endl; + + bool pass = true; + + if(do_verification) + { + out_dev.FromDevice(out.mData.data()); + pass = pass && ck::utils::check_err(out.mData, out_ref.mData); + + if(OutputIndex) + { + out_index_dev.FromDevice(out_indices.mData.data()); + pass = pass && ck::utils::check_err(out_indices.mData, out_indices_ref.mData); + }; + }; + + return (pass ? 0 : 1); +} diff --git a/example/12_reduce/reduce_example_common.hpp b/example/12_reduce/reduce_example_common.hpp new file mode 100644 index 0000000000..6334f608e3 --- /dev/null +++ b/example/12_reduce/reduce_example_common.hpp @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/ck.hpp" + +template +std::vector get_invariant_dims(const std::vector& reduceDims) +{ + assert(NumReduceDim == reduceDims.size()); + + int reduceFlag = 0; + + // flag the bits for the reduceDims + for(int i = 0; i < NumReduceDim; i++) + { + reduceFlag |= 1 << reduceDims[i]; + }; + + std::vector invariantDims; + + // collect invariant dimensions + for(int i = 0; i < Rank; i++) + if((reduceFlag & (1 << i)) == 0) + { + invariantDims.push_back(i); + }; + + return invariantDims; +}; + +template +struct ReduceShape +{ + static constexpr ck::index_t Rank_ = Rank; + static constexpr ck::index_t NumReduceDim_ = NumReduceDim; +}; + +using reduce_shape_instances = std::tuple, + ReduceShape<3, 2>, + ReduceShape<4, 1>, + ReduceShape<4, 2>, + ReduceShape<4, 3>, + ReduceShape<5, 1>, + ReduceShape<5, 2>, + ReduceShape<5, 3>, + ReduceShape<5, 4>>; diff --git a/example/12_reduce/reduce_multiblock_atomic_add.cpp b/example/12_reduce/reduce_multiblock_atomic_add.cpp new file mode 100644 index 0000000000..9b56598ca3 --- /dev/null +++ b/example/12_reduce/reduce_multiblock_atomic_add.cpp @@ -0,0 +1,212 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "ck/utility/reduction_enums.hpp" +#include "reduce_multiblock_atomic_add_impl.hpp" +#include "reduce_example_common.hpp" + +using namespace ck; +using namespace ck::tensor_operation::device; + +static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'}, + {"verify", required_argument, nullptr, 'v'}, + {"help", no_argument, nullptr, '?'}, + {nullptr, 0, nullptr, 0}}; + +class SimpleAppArgs +{ + private: + int option_index = 0; + + public: + std::vector inLengths = {16, 64, 32, 960}; + std::vector reduceDims = {0, 1, 2}; + std::vector scales = {1.0f, 0.0f}; + + bool do_verification = true; + int data_type = 1; + int init_method = 2; + bool time_kernel = true; + + public: + void show_usage(const char* cmd) + { + std::cout << "Usage of " << cmd << std::endl; + std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths" + << std::endl; + std::cout << "--reduceDims or -R, comma separated list of to-reduce dimensions" + << std::endl; + std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by " + "comparing with the host-based reduction" + << std::endl; + std::cout << "Arg1: data type (0: fp32, 1: fp64)" << std::endl; + std::cout << "Arg2 -- init method (0=no init, 1=single integer value, 2=scope integer " + "value, 3=decimal value)" + << std::endl; + std::cout << "Arg3 -- time kernel (0=no, 1=yes)" << std::endl; + }; + + int processArgs(int argc, char* argv[]) + { + using ck::host_common::getTypeValuesFromString; + + int ch; + + while(1) + { + ch = getopt_long(argc, argv, "D:R:v:l:", long_options, &option_index); + if(ch == -1) + break; + switch(ch) + { + case 'D': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + inLengths = getTypeValuesFromString(optarg); + break; + case 'R': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + reduceDims = getTypeValuesFromString(optarg); + break; + case 'v': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + do_verification = static_cast(std::atoi(optarg)); + break; + case '?': + if(std::string(long_options[option_index].name) == "help") + { + show_usage(argv[0]); + return (-1); + }; + break; + default: show_usage(argv[0]); return (-1); + }; + }; + + if(optind + 3 > argc) + { + throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!"); + }; + + data_type = std::atoi(argv[optind++]); + init_method = std::atoi(argv[optind++]); + time_kernel = static_cast(std::atoi(argv[optind])); + + if(scales.empty()) + { + scales.push_back(1.0f); + scales.push_back(0.0f); + }; + + return (0); + }; +}; + +template +bool reduce_multiblock_atomic_add_test(bool do_verification, + int init_method, + bool time_kernel, + const std::vector& inLengths, + const std::vector& reduceDims, + float alpha, + float beta) +{ + bool matched = false; + int result = 0; + + const auto tuple_object = reduce_shape_instances{}; + + static_for<0, std::tuple_size::value, 1>{}([&](auto i) { + if(matched) + return; + + using ShapeType = remove_cvref_t(tuple_object))>; + + if(ShapeType::Rank_ != inLengths.size() || ShapeType::NumReduceDim_ != reduceDims.size()) + return; + + result = reduce_multiblock_atomic_add_impl( + do_verification, init_method, time_kernel, inLengths, reduceDims, alpha, beta); + + matched = true; + }); + + return (result == 0) ? true : false; +}; + +constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::AVG; +constexpr bool PropagateNan = true; + +int main(int argc, char* argv[]) +{ + bool pass = true; + + if(argc > 1) + { + SimpleAppArgs arg; + + if(arg.processArgs(argc, argv) < 0) + return (-1); + + if(arg.data_type == 0) + { + pass = reduce_multiblock_atomic_add_test( + arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inLengths, + arg.reduceDims, + arg.scales[0], + arg.scales[1]); + } + else if(arg.data_type == 1) + { + pass = reduce_multiblock_atomic_add_test( + arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inLengths, + arg.reduceDims, + arg.scales[0], + arg.scales[1]); + } + } + else + { + // for testing float + pass = pass && reduce_multiblock_atomic_add_test( + true, 2, false, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f); + + // for testing double + pass = pass && reduce_multiblock_atomic_add_test( + true, 2, false, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f); + + // for testing 3D input + pass = pass && reduce_multiblock_atomic_add_test( + true, 2, false, {16, 64, 960}, {0, 1}, 1.0f, 0.0f); + + // for testing 5D input + pass = pass && reduce_multiblock_atomic_add_test( + true, 2, false, {16, 64, 32, 2, 960}, {0, 1, 2, 3}, 1.0f, 0.0f); + }; + + return (pass ? 0 : 1); +}; diff --git a/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp b/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp new file mode 100644 index 0000000000..c2fa8da914 --- /dev/null +++ b/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp @@ -0,0 +1,230 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_enums.hpp" +#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" +#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/host_common_util.hpp" +#include "ck/library/utility/host_reduction.hpp" + +#include "reduce_example_common.hpp" + +template +int reduce_multiblock_atomic_add_impl(bool do_verification, + int init_method, + bool time_kernel, + const std::vector& inLengths, + const std::vector& reduceDims, + float alpha, + float beta) + +{ + using namespace ck; + using namespace ck::tensor_operation::device; + + constexpr bool op_support_atomic_add = + (ReduceOpId == ReduceTensorOp::ADD || ReduceOpId == ReduceTensorOp::AVG); + + constexpr bool invalid_reduce_1 = !op_support_atomic_add; + constexpr bool invalid_reduce_2 = + !(std::is_same::value || std::is_same::value); + + constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2); + + if(invalid_reduce) + { + std::cerr << "The reduction setting is invalid, exiting!" << std::endl; + return (-1); + }; + + using ReduceOperation = typename reduce_binary_operator::opType; + using InElementwiseOperation = + typename reduce_unary_operator::InElementwiseOperation; + using AccElementwiseOperation = + typename reduce_unary_operator::AccElementwiseOperation; + + using DeviceReduceInstance = + ck::tensor_operation::device::DeviceReduceMultiBlock; + + Tensor in(inLengths); + + std::vector outLengths; + + std::vector invariantDims = get_invariant_dims(reduceDims); + + if(invariantDims.empty()) + outLengths.push_back(1); + else + for(auto dim : invariantDims) + outLengths.push_back(inLengths[dim]); + + Tensor out_ref(outLengths); + Tensor out(outLengths); + + auto inStrides = in.mDesc.GetStrides(); + auto outStrides = out.mDesc.GetStrides(); + + size_t invariant_total_length = out.mDesc.GetElementSize(); + size_t reduce_total_length = in.mDesc.GetElementSize() / invariant_total_length; + + std::size_t num_thread = 1; + + if(do_verification) + { + switch(init_method) + { + case 0: break; + case 1: + in.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + break; + case 2: + in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + default: + in.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, + num_thread); + } + + if(beta != 0.0f) + for(size_t i = 0; i < out_ref.mDesc.GetElementSpaceSize(); i++) + out.mData[i] = out_ref.mData[i]; + }; + + // these buffers are usually provided by the user application + DeviceMem in_dev(sizeof(InOutDataType) * in.mDesc.GetElementSpaceSize()); + DeviceMem out_dev(sizeof(InOutDataType) * out.mDesc.GetElementSpaceSize()); + + in_dev.ToDevice(in.mData.data()); + + if(beta != 0.0f) + out_dev.ToDevice(out.mData.data()); + + InElementwiseOperation in_elementwise_op; + AccElementwiseOperation acc_elementwise_op; + + std::tie(in_elementwise_op, acc_elementwise_op) = + reduce_unary_operator::GetElementwiseOperator( + static_cast(reduce_total_length)); + + if(do_verification) + { + ReductionHost + hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims); + + hostReduce.Run(alpha, + in.mData.data(), + beta, + out_ref.mData.data(), + nullptr, + in_elementwise_op, + acc_elementwise_op); + }; + + std::vector i_inLengths; + std::vector i_inStrides; + std::vector i_outLengths; + std::vector i_outStrides; + + i_inLengths.assign(inLengths.begin(), inLengths.end()); + i_inStrides.assign(inStrides.begin(), inStrides.end()); + i_outLengths.assign(outLengths.begin(), outLengths.end()); + i_outStrides.assign(outStrides.begin(), outStrides.end()); + + auto reduce = DeviceReduceInstance{}; + + auto argument_ptr = reduce.MakeArgumentPointer(i_inLengths, + i_inStrides, + i_outLengths, + i_outStrides, + reduceDims, + alpha, + beta, + in_dev.GetDeviceBuffer(), + nullptr, + out_dev.GetDeviceBuffer(), + nullptr, + in_elementwise_op, + acc_elementwise_op); + + if(!reduce.IsSupportedArgument(argument_ptr.get())) + { + std::cerr + << "The runtime parameters seems not supported by the DeviceReduce instance, exiting!" + << std::endl; + + return (-2); + }; + + std::string reduce_name = reduce.GetTypeString(); + + auto invoker_ptr = reduce.MakeInvokerPointer(); + + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InOutDataType) + + invariant_total_length * sizeof(InOutDataType); + + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " << reduce_name + << std::endl; + + bool pass = true; + + if(do_verification) + { + out_dev.FromDevice(out.mData.data()); + pass = pass && ck::utils::check_err(out.mData, out_ref.mData); + }; + + return (pass ? 0 : 1); +}