Overhaul to Reducton and its dependants (#237)

* Tiny fix in dynamic_buffer.hpp to support vectorized AtomicAdd for double type * Update to host layer and host reduction * Merge and remove reduction kernels * Merge and remove reduction device interfaces and update pooling device interface * Merge and remove useless reduction device instances * Update to reduction profiler and reduction ctests * Update to reduction and pooling examples and add one reduction example * Change to reduction examples to let them testable by ctest * Add explicit pass checking for reduction and pooling examples * Explicit assignment of tensor shapes in example reduce_blockwise_two_call * Use atomic_add to repace atomicAdd and add atomic_add for double type * Add reduce ctest support for double data type * Replace to_int_vector() by using c++ std::vector::assign() * Keep DeviceReduceThreadWise separated from DeviceReduceBlockWise * Merge DeviceReduceBlockWise and DeviceReduceMultiBlockAtomicAdd into DeviceReduceMultiBlock * Add GetAtomicOperationZeroValue() support for AtomicMax * Tiny change to reduce example README.md * Fix some tiny issues due to branch merging * Revoke previous change in dynamic_buffer.hpp and add atomic_add for double2_t * Add reduce multiblock_atomic_add instances for fp64 to verify vectorized atomic_add on fp64 * Renaming * Clean the header includings in device_reduce instances header files
2026-04-19 22:39:03 +00:00 · 2022-05-25 01:19:12 +08:00
parent 1085794df3
commit 63eee2d999
94 changed files with 2429 additions and 6785 deletions
--- a/test/reduce/reduce_no_index.cpp
+++ b/test/reduce/reduce_no_index.cpp
@@ -1,384 +1,10 @@
 #include "getopt.h"

-#include "check_err.hpp"
-#include "device_reduce_instance.hpp"
-#include "reduction_enums.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "host_reduction.hpp"
-#include "reduce_util.hpp"
+#include "host_common_util.hpp"
+#include "profile_reduce_impl.hpp"

 using namespace ck;

-namespace {
-
-template <index_t Rank, index_t NumReduceDim>
-static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduceDims)
-{
-    assert(NumReduceDim == reduceDims.size());
-
-    int reduceFlag = 0;
-
-    // flag the bits for the reduceDims
-    for(int i = 0; i < NumReduceDim; i++)
-    {
-        reduceFlag |= 1 << reduceDims[i];
-    };
-
-    std::vector<int> invariantDims;
-
-    // collect invariant dimensions
-    for(int i = 0; i < Rank; i++)
-        if((reduceFlag & (1 << i)) == 0)
-        {
-            invariantDims.push_back(i);
-        };
-
-    return invariantDims;
-};
-
-constexpr int Rank = 4;
-
-constexpr ReduceTensorOp ReduceOpId      = ReduceTensorOp::AVG;
-constexpr NanPropagation NanOpt          = NanPropagation::PROPAGATE_NAN;
-constexpr bool PropagateNan              = false;
-constexpr ReduceTensorIndices IndicesOpt = ReduceTensorIndices::NO_INDICES;
-constexpr bool NeedIndices               = false;
-
-template <typename InDataType,
-          typename AccDataType,
-          typename OutDataType,
-          int Rank,
-          int NumReduceDim>
-bool test_reduce_no_index_impl(int init_method,
-                               const std::vector<size_t>& inLengths,
-                               const std::vector<int>& reduceDims,
-                               float alpha,
-                               float beta)
-{
-    using namespace ck::tensor_operation::device;
-    using namespace ck::tensor_operation::device::device_reduce_instance;
-    using namespace ck::host_reduce;
-
-    constexpr bool out_support_atomic_add = std::is_same<OutDataType, float>::value;
-    constexpr bool op_support_atomic_add  = true;
-    constexpr bool use_atomic_add         = (out_support_atomic_add && op_support_atomic_add);
-
-    Tensor<InDataType> in(inLengths);
-
-    std::vector<size_t> outLengths;
-
-    const auto invariantDims = get_invariant_dims<Rank, NumReduceDim>(reduceDims);
-
-    if(reduceDims.size() == Rank)
-        outLengths.push_back(1);
-    else
-        for(auto dim : invariantDims)
-            outLengths.push_back(inLengths[dim]);
-
-    Tensor<OutDataType> out_ref(outLengths);
-    Tensor<OutDataType> out(outLengths);
-
-    // only used when the OutDataType is bhalf_t
-    Tensor<float> out_ref_fp32(outLengths);
-    Tensor<float> out_fp32(outLengths);
-
-    auto inStrides  = in.mDesc.GetStrides();
-    auto outStrides = out.mDesc.GetStrides();
-
-    size_t invariant_total_length = out.mDesc.GetElementSize();
-    size_t reduce_total_length    = in.mDesc.GetElementSize() / invariant_total_length;
-
-    std::size_t num_thread = 1;
-
-    switch(init_method)
-    {
-    case 0: break;
-    case 1:
-        in.GenerateTensorValue(GeneratorTensor_1<InDataType>{1}, num_thread);
-        if(beta != 0.0f)
-            out_ref.GenerateTensorValue(GeneratorTensor_1<InDataType>{1}, num_thread);
-        break;
-    case 2:
-        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}, num_thread);
-        if(beta != 0.0f)
-            out_ref.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}, num_thread);
-        break;
-    default:
-        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0}, num_thread);
-        if(beta != 0.0f)
-            out_ref.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0}, num_thread);
-    }
-
-    if(beta != 0.0f)
-        for(size_t i = 0; i < out_ref.mDesc.GetElementSpace(); i++)
-            out.mData[i] = out_ref.mData[i];
-
-    // these buffers are usually provided by the user application
-    DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpace());
-    DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpace());
-
-    in_dev.ToDevice(in.mData.data());
-
-    if(beta != 0.0f)
-        out_dev.ToDevice(out.mData.data());
-
-    using InElementwiseOperation_0 =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation;
-    using AccElementwiseOperation_0 =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
-            AccElementwiseOperation;
-    using InElementwiseOperation_1 =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::
-            InElementwiseOperation;
-    using AccElementwiseOperation_1 =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::
-            AccElementwiseOperation;
-    using InElementwiseOperation_2 =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::
-            InElementwiseOperation;
-    using AccElementwiseOperation_2 =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::
-            AccElementwiseOperation;
-
-    using DeviceReduceInstPtr0 =
-        DeviceReducePtr<InElementwiseOperation_0, AccElementwiseOperation_0>;
-    using DeviceReduceInstPtr1 =
-        DeviceReducePtr<InElementwiseOperation_1, AccElementwiseOperation_1>;
-    using DeviceReduceInstPtr2 =
-        DeviceReducePtr<InElementwiseOperation_2, AccElementwiseOperation_2>;
-
-    std::vector<DeviceReduceInstPtr0> reduce0_ptrs;
-    std::vector<DeviceReduceInstPtr1> reduce1_ptrs;
-    std::vector<DeviceReduceInstPtr2> reduce2_ptrs;
-
-    add_device_reduce_instance_threadwise<InDataType,
-                                          AccDataType,
-                                          OutDataType,
-                                          Rank,
-                                          NumReduceDim,
-                                          ReduceOpId,
-                                          NanOpt,
-                                          IndicesOpt>(reduce0_ptrs);
-
-    add_device_reduce_instance_blockwise<InDataType,
-                                         AccDataType,
-                                         OutDataType,
-                                         Rank,
-                                         NumReduceDim,
-                                         ReduceOpId,
-                                         NanOpt,
-                                         IndicesOpt>(reduce0_ptrs);
-
-    if constexpr(use_atomic_add)
-    {
-        add_device_reduce_instance_multiblock_atomic_add<InDataType,
-                                                         AccDataType,
-                                                         OutDataType,
-                                                         Rank,
-                                                         NumReduceDim,
-                                                         ReduceOpId,
-                                                         NanOpt,
-                                                         IndicesOpt>(reduce0_ptrs);
-    }
-    else
-    {
-        add_device_reduce_instance_multiblock_partial_reduce<InDataType,
-                                                             AccDataType,
-                                                             OutDataType,
-                                                             Rank,
-                                                             NumReduceDim,
-                                                             ReduceOpId,
-                                                             NanOpt,
-                                                             IndicesOpt>(reduce1_ptrs);
-    };
-
-    // used for secondary reduction
-    if constexpr(!use_atomic_add)
-    {
-        add_device_reduce_instance_blockwise_second_call<AccDataType,
-                                                         AccDataType,
-                                                         OutDataType,
-                                                         Rank,
-                                                         NumReduceDim,
-                                                         ReduceOpId,
-                                                         NanOpt,
-                                                         IndicesOpt>(reduce2_ptrs);
-    };
-
-    if(reduce0_ptrs.empty() && reduce1_ptrs.empty())
-    {
-        throw std::runtime_error("Wrong! No device REDUCE instance found");
-    };
-
-    bool result = true;
-
-    ReductionHost<InDataType,
-                  AccDataType,
-                  OutDataType,
-                  ReduceOpId,
-                  Rank,
-                  NumReduceDim,
-                  PropagateNan,
-                  NeedIndices>
-        hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
-
-    hostReduce.Run(alpha, in.mData.data(), beta, out_ref.mData.data(), nullptr);
-
-    const auto i_inLengths  = to_int_vector(inLengths);
-    const auto i_inStrides  = to_int_vector(inStrides);
-    const auto i_outLengths = to_int_vector(outLengths);
-    const auto i_outStrides = to_int_vector(outStrides);
-
-    for(auto& reduce_ptr : reduce0_ptrs)
-    {
-        auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims);
-
-        DeviceMem ws_dev(wsSizeInBytes);
-
-        InElementwiseOperation_0 in_elementwise_op_0(static_cast<int32_t>(reduce_total_length));
-        AccElementwiseOperation_0 acc_elementwise_op_0(static_cast<int32_t>(reduce_total_length));
-
-        auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths,
-                                                            i_inStrides,
-                                                            i_outLengths,
-                                                            i_outStrides,
-                                                            reduceDims,
-                                                            alpha,
-                                                            beta,
-                                                            in_dev.GetDeviceBuffer(),
-                                                            out_dev.GetDeviceBuffer(),
-                                                            nullptr,
-                                                            ws_dev.GetDeviceBuffer(),
-                                                            in_elementwise_op_0,
-                                                            acc_elementwise_op_0);
-
-        if(!reduce_ptr->IsSupportedArgument(argument_ptr.get()))
-            continue;
-
-        auto invoker_ptr = reduce_ptr->MakeInvokerPointer();
-
-        (void)invoker_ptr->Run(argument_ptr.get());
-
-        out_dev.FromDevice(out.mData.data());
-
-        bool single_result = true;
-
-        if constexpr(std::is_same<OutDataType, ck::half_t>::value ||
-                     std::is_same<OutDataType, ck::bhalf_t>::value)
-        {
-            reduce_util::to_f32_vector(out, out_fp32);
-            reduce_util::to_f32_vector(out_ref, out_ref_fp32);
-            single_result = ck::utils::check_err(
-                out_fp32.mData, out_ref_fp32.mData, "Error: incorrect data result!");
-        }
-        else
-        {
-            single_result =
-                ck::utils::check_err(out.mData, out_ref.mData, "Error: incorrect data result!");
-        };
-
-        if(!single_result)
-        {
-            std::cout << "Fail Info: " << reduce_ptr->GetTypeString() << std::endl;
-            result = false;
-        }
-    };
-
-    for(auto& reduce_ptr : reduce1_ptrs)
-    {
-        auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims);
-
-        DeviceMem ws_dev(wsSizeInBytes);
-
-        InElementwiseOperation_1 in_elementwise_op_1(static_cast<int32_t>(reduce_total_length));
-        AccElementwiseOperation_1 acc_elementwise_op_1(static_cast<int32_t>(reduce_total_length));
-
-        auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths,
-                                                            i_inStrides,
-                                                            i_outLengths,
-                                                            i_outStrides,
-                                                            reduceDims,
-                                                            alpha,
-                                                            beta,
-                                                            in_dev.GetDeviceBuffer(),
-                                                            out_dev.GetDeviceBuffer(),
-                                                            nullptr,
-                                                            ws_dev.GetDeviceBuffer(),
-                                                            in_elementwise_op_1,
-                                                            acc_elementwise_op_1);
-
-        if(!reduce_ptr->IsSupportedArgument(argument_ptr.get()))
-            continue;
-
-        auto invoker_ptr = reduce_ptr->MakeInvokerPointer();
-
-        (void)invoker_ptr->Run(argument_ptr.get());
-
-        std::vector<int> inLengths2 = reduce_ptr->GetWorkspace2dLengths(argument_ptr.get());
-        std::vector<int> inStrides2{inLengths2[1], 1};
-
-        for(auto& reduce2_ptr : reduce2_ptrs)
-        {
-            InElementwiseOperation_2 in_elementwise_op_2(static_cast<int32_t>(reduce_total_length));
-            AccElementwiseOperation_2 acc_elementwise_op_2(
-                static_cast<int32_t>(reduce_total_length));
-
-            auto argument2_ptr = reduce2_ptr->MakeArgumentPointer(inLengths2,
-                                                                  inStrides2,
-                                                                  i_outLengths,
-                                                                  i_outStrides,
-                                                                  reduceDims,
-                                                                  alpha,
-                                                                  beta,
-                                                                  ws_dev.GetDeviceBuffer(),
-                                                                  out_dev.GetDeviceBuffer(),
-                                                                  nullptr,
-                                                                  ws_dev.GetDeviceBuffer(),
-                                                                  in_elementwise_op_2,
-                                                                  acc_elementwise_op_2);
-
-            if(!reduce2_ptr->IsSupportedArgument(argument2_ptr.get()))
-                continue;
-
-            std::string reduce2_name = reduce2_ptr->GetTypeString();
-
-            auto invoker2_ptr = reduce2_ptr->MakeInvokerPointer();
-
-            (void)invoker2_ptr->Run(argument2_ptr.get());
-
-            out_dev.FromDevice(out.mData.data());
-
-            bool single_result = true;
-
-            if constexpr(std::is_same<OutDataType, ck::half_t>::value ||
-                         std::is_same<OutDataType, ck::bhalf_t>::value)
-            {
-                reduce_util::to_f32_vector(out, out_fp32);
-                reduce_util::to_f32_vector(out_ref, out_ref_fp32);
-                single_result = ck::utils::check_err(
-                    out_fp32.mData, out_ref_fp32.mData, "Error: incorrect data result!");
-            }
-            else
-            {
-                single_result =
-                    ck::utils::check_err(out.mData, out_ref.mData, "Error: incorrect data result!");
-            };
-
-            if(!single_result)
-            {
-                std::cout << "Fail Info: " << reduce_ptr->GetTypeString() << " => "
-                          << reduce2_ptr->GetTypeString() << std::endl;
-                result = false;
-            }
-        };
-    };
-
-    return (result);
-};
-
-} // anonymous namespace
-
 static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
                                       {"reduceDimensions", required_argument, nullptr, 'R'},
                                       {"scales", required_argument, nullptr, 'S'},
@@ -387,48 +13,6 @@ static struct option long_options[] = {{"inLengths", required_argument, nullptr,

 class SimpleAppArgs
 {
-    template <typename T>
-    static T getSingleValueFromString(const std::string& valueStr)
-    {
-        std::istringstream iss(valueStr);
-
-        T ret;
-
-        iss >> ret;
-
-        return (ret);
-    };
-
-    template <typename T>
-    static std::vector<T> getTypeValuesFromString(const char* cstr_values)
-    {
-        std::string valuesStr(cstr_values);
-
-        std::vector<T> values;
-        std::size_t pos = 0;
-        std::size_t new_pos;
-
-        new_pos = valuesStr.find(',', pos);
-        while(new_pos != std::string::npos)
-        {
-            const std::string sliceStr = valuesStr.substr(pos, new_pos - pos);
-
-            T val = getSingleValueFromString<T>(sliceStr);
-
-            values.push_back(val);
-
-            pos     = new_pos + 1;
-            new_pos = valuesStr.find(',', pos);
-        };
-
-        std::string sliceStr = valuesStr.substr(pos);
-        T val                = getSingleValueFromString<T>(sliceStr);
-
-        values.push_back(val);
-
-        return (values);
-    };
-
    private:
    int option_index = 0;

@@ -460,6 +44,8 @@ class SimpleAppArgs

    int processArgs(int argc, char* argv[])
    {
+        using ck::host_common::getTypeValuesFromString;
+
        int ch;

        while(1)
@@ -514,7 +100,7 @@ class SimpleAppArgs
           (reduceDims.size() != 1 && reduceDims.size() != 3 && reduceDims.size() != 4))
            return (-1);

-        if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5)
+        if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5 && data_type != 6)
            return (-1);

        return (0);
@@ -525,87 +111,92 @@ bool test_reduce_no_index(int data_type,
                          int init_method,
                          std::vector<int> reduceDims,
                          std::vector<size_t> inLengths,
+                          ReduceTensorOp reduceOpId,
+                          bool propagateNan,
                          float alpha,
                          float beta)
 {
+    using ck::profiler::profile_reduce_impl;
+
    bool result = true;

    if(data_type == 0)
    {
-        switch(reduceDims.size())
-        {
-        case 1:
-            result = test_reduce_no_index_impl<float, float, float, Rank, 1>(
-                init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 3:
-            result = test_reduce_no_index_impl<float, float, float, Rank, 3>(
-                init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 4:
-            result = test_reduce_no_index_impl<float, float, float, Rank, 4>(
-                init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        };
+        result = profile_reduce_impl<float, float, float>(true,
+                                                          init_method,
+                                                          false,
+                                                          false,
+                                                          inLengths,
+                                                          reduceDims,
+                                                          reduceOpId,
+                                                          propagateNan,
+                                                          false,
+                                                          alpha,
+                                                          beta);
    }
    else if(data_type == 1)
    {
-        switch(reduceDims.size())
-        {
-        case 1:
-            result = test_reduce_no_index_impl<ck::half_t, float, ck::half_t, Rank, 1>(
-                init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 3:
-            result = test_reduce_no_index_impl<ck::half_t, float, ck::half_t, Rank, 3>(
-                init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 4:
-            result = test_reduce_no_index_impl<ck::half_t, float, ck::half_t, Rank, 4>(
-                init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        };
+        result = profile_reduce_impl<ck::half_t, float, ck::half_t>(true,
+                                                                    init_method,
+                                                                    false,
+                                                                    false,
+                                                                    inLengths,
+                                                                    reduceDims,
+                                                                    reduceOpId,
+                                                                    propagateNan,
+                                                                    false,
+                                                                    alpha,
+                                                                    beta);
    }
    else if(data_type == 3)
    {
-        switch(reduceDims.size())
-        {
-        case 1:
-            result = test_reduce_no_index_impl<int8_t, int32_t, int8_t, Rank, 1>(
-                init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 3:
-            result = test_reduce_no_index_impl<int8_t, int32_t, int8_t, Rank, 3>(
-                init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 4:
-            result = test_reduce_no_index_impl<int8_t, int32_t, int8_t, Rank, 4>(
-                init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        };
+        result = profile_reduce_impl<int8_t, int32_t, int8_t>(true,
+                                                              init_method,
+                                                              false,
+                                                              false,
+                                                              inLengths,
+                                                              reduceDims,
+                                                              reduceOpId,
+                                                              propagateNan,
+                                                              false,
+                                                              alpha,
+                                                              beta);
    }
    else if(data_type == 5)
    {
-        switch(reduceDims.size())
-        {
-        case 1:
-            result = test_reduce_no_index_impl<ck::bhalf_t, float, ck::bhalf_t, Rank, 1>(
-                init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 3:
-            result = test_reduce_no_index_impl<ck::bhalf_t, float, ck::bhalf_t, Rank, 3>(
-                init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 4:
-            result = test_reduce_no_index_impl<ck::bhalf_t, float, ck::bhalf_t, Rank, 4>(
-                init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        };
+        result = profile_reduce_impl<ck::bhalf_t, float, ck::bhalf_t>(true,
+                                                                      init_method,
+                                                                      false,
+                                                                      false,
+                                                                      inLengths,
+                                                                      reduceDims,
+                                                                      reduceOpId,
+                                                                      propagateNan,
+                                                                      false,
+                                                                      alpha,
+                                                                      beta);
+    }
+    else if(data_type == 6)
+    {
+        result = profile_reduce_impl<double, double, double>(true,
+                                                             init_method,
+                                                             false,
+                                                             false,
+                                                             inLengths,
+                                                             reduceDims,
+                                                             reduceOpId,
+                                                             propagateNan,
+                                                             false,
+                                                             alpha,
+                                                             beta);
    }

    return (result);
 };

+constexpr ReduceTensorOp reduceOpId = ReduceTensorOp::AVG;
+constexpr bool propagateNan         = false;
+
 int main(int argc, char* argv[])
 {
    SimpleAppArgs args;
@@ -621,8 +212,14 @@ int main(int argc, char* argv[])
            {0, 1, 2, 3}, {0, 1, 2}, {1, 2, 3}, {0, 1, 3}, {0, 2, 3}, {0}, {1}, {2}, {3}};

        for(auto& reduceDims : v_reduceDims)
-            result = result && test_reduce_no_index(
-                                   data_type, init_method, reduceDims, inLengths, 1.0f, 0.0f);
+            result = result && test_reduce_no_index(data_type,
+                                                    init_method,
+                                                    reduceDims,
+                                                    inLengths,
+                                                    reduceOpId,
+                                                    propagateNan,
+                                                    1.0f,
+                                                    0.0f);
    }
    else
    {
@@ -636,6 +233,8 @@ int main(int argc, char* argv[])
                                      args.init_method,
                                      args.reduceDims,
                                      args.inLengths,
+                                      reduceOpId,
+                                      propagateNan,
                                      args.scales[0],
                                      args.scales[1]);
    }