// composable_kernel/example/12_reduce/reduce_blockwise_two_call.cpp

#include <iostream>
#include <numeric>
#include <sstream>
#include <initializer_list>
#include <cstdlib>
#include <getopt.h>

#include "check_err.hpp"
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "device_base.hpp"
#include "device_reduce_multiblock.hpp"
#include "host_common_util.hpp"
#include "host_reduction.hpp"

#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"

using namespace ck;
using namespace ck::tensor_operation::device;

// Element type shared by the input tensor, the intermediate tensor and the final output.
using InOutDataType = ck::half_t;
// Second data-type argument handed to the host and device reduction templates
// (the accumulation type, judging by the name).
using AccDataType   = float;

// Compile-time configuration shared by the host reference and both device reduction instances.
constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::NORM2;
constexpr bool PropagateNan         = true;
constexpr bool OutputIndex          = false;

// Binary reduction operator selected by ReduceOpId.
using ReduceOperation = typename reduce_binary_operator<ReduceOpId>::opType;

// Element-wise operators associated with ReduceOpId.
// NOTE(review): the meaning of the two `true` template arguments is defined by
// reduce_unary_operator — confirm against its declaration.
using InElementwiseOperation =
    typename reduce_unary_operator<ReduceOpId, true, true>::InElementwiseOperation;
using AccElementwiseOperation =
    typename reduce_unary_operator<ReduceOpId, true, true>::AccElementwiseOperation;

// Fills the element-wise operator slot that is not used by a given call: the second operator
// slot of the first reduction and the first operator slot of the second reduction.
using PassThroughOp = tensor_operation::element_wise::PassThrough;

// Device instance used for the first reduction call: a rank-5 tensor reduced over one
// dimension. Its two element-wise operator arguments are InElementwiseOperation and
// PassThroughOp; AccElementwiseOperation is left to the second instance.
// NOTE(review): the unlabeled trailing integer arguments (256, 32, 8, ...) are tuning
// parameters whose meaning is defined by DeviceReduceMultiBlock — confirm against its
// declaration before changing them.
using DeviceReduceInstance_1 = DeviceReduceMultiBlock<InOutDataType,
                                                      AccDataType,
                                                      InOutDataType,
                                                      5, // Rank
                                                      1, // NumReduceDim
                                                      ReduceOperation,
                                                      InElementwiseOperation,
                                                      PassThroughOp,
                                                      InMemoryDataOperationEnum::Set,
                                                      PropagateNan,
                                                      OutputIndex,
                                                      false, // HaveIndexInputIfOutputIndex
                                                      256,
                                                      32,
                                                      8,
                                                      1,
                                                      1,
                                                      1, // vector dim
                                                      1,
                                                      1>;

// Device instance used for the second reduction call: a rank-4 tensor reduced over one
// dimension. Its two element-wise operator arguments are PassThroughOp and
// AccElementwiseOperation, complementing the operators used by DeviceReduceInstance_1.
// NOTE(review): the unlabeled trailing integer arguments (256, 128, 2, ...) are tuning
// parameters whose meaning is defined by DeviceReduceMultiBlock — confirm against its
// declaration before changing them.
using DeviceReduceInstance_2 = DeviceReduceMultiBlock<InOutDataType,
                                                      AccDataType,
                                                      InOutDataType,
                                                      4, // Rank
                                                      1, // NumReduceDim
                                                      ReduceOperation,
                                                      PassThroughOp,
                                                      AccElementwiseOperation,
                                                      InMemoryDataOperationEnum::Set,
                                                      PropagateNan,
                                                      OutputIndex,
                                                      false, // HaveIndexInputIfOutputIndex
                                                      256,
                                                      128,
                                                      2,
                                                      1,
                                                      1,
                                                      1, // vector dim
                                                      1,
                                                      1>;

// Run-time options. main() assigns all of them before first use: do_verify, init_method and
// time_kernel come from the command line (or built-in defaults when no arguments are given),
// while alpha and beta are set to fixed values.

// Whether to compute a host reference and compare the device result against it.
static bool do_verify;
// Selects how the input tensor is filled when verification is enabled (see the switch in main).
static int init_method;
// Scaling factors passed to the host reduction and to the second device reduction.
static float alpha;
static float beta;
// Forwarded to StreamConfig when the device reductions are run.
static bool time_kernel;

int main(int argc, char* argv[])
{
    // used by the device reduction
    const std::vector<int> reduceDims_1    = {4};
    const std::vector<int> invariantDims_1 = {0, 1, 2, 3};

    const std::vector<int> reduceDims_2    = {3};
    const std::vector<int> invariantDims_2 = {0, 1, 2};

    // used by the host reduction
    const std::vector<int> reduceDims    = {3, 4};
    const std::vector<int> invariantDims = {0, 1, 2};

    const std::vector<size_t> inLengths_1 = {64, 320, 80, 4, 128};

    // input lengths of the second reduction, which is also the output lengths of the first
    // reduction
    const std::vector<size_t> inLengths_2 = {64, 320, 80, 4};

    const std::vector<size_t> outLengths = {64, 320, 80};

    if(argc == 1)
    {
        do_verify   = true;
        init_method = 2;
        time_kernel = true;
    }
    else if(argc == 4)
    {
        do_verify   = static_cast<bool>(argv[1]);
        init_method = atoi(argv[2]);
        time_kernel = static_cast<bool>(atoi(argv[3]));
    }
    else
    {
        std::ostringstream ostr;

        ostr << "Wrong parameter! " << std::endl
             << "Usage: " << argv[0] << "[verify 0/1] init_method time_kernel" << std::endl;

        throw std::runtime_error(ostr.str());
    };

    alpha = 1.0f;
    beta  = 0.0f;

    Tensor<InOutDataType> in_1(inLengths_1);

    Tensor<InOutDataType> out_ref(outLengths);
    Tensor<InOutDataType> in_2(inLengths_2); // also the output tensor of the first reduction
    Tensor<InOutDataType> out(outLengths);

    auto inStrides_1 = in_1.mDesc.GetStrides();
    auto inStrides_2 = in_2.mDesc.GetStrides();
    auto outStrides  = out.mDesc.GetStrides();

    size_t invariant_total_length = out.mDesc.GetElementSize();
    size_t reduce_total_length    = in_1.mDesc.GetElementSize() / invariant_total_length;

    std::size_t num_thread = 1;

    if(do_verify)
    {
        switch(init_method)
        {
        case 0: break;
        case 1:
            in_1.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread);
            if(beta != 0.0f)
                out_ref.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread);
            break;
        case 2:
            in_1.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
            if(beta != 0.0f)
                out_ref.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
            break;
        default:
            in_1.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0, 5.0}, num_thread);
            if(beta != 0.0f)
                out_ref.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0, 5.0},
                                            num_thread);
        }

        if(beta != 0.0f)
            for(size_t i = 0; i < out_ref.mDesc.GetElementSpace(); i++)
                out.mData[i] = out_ref.mData[i];
    };

    DeviceMem in_1_dev(sizeof(InOutDataType) * in_1.mDesc.GetElementSpace());
    DeviceMem in_2_dev(sizeof(InOutDataType) * in_2.mDesc.GetElementSpace());
    DeviceMem out_dev(sizeof(InOutDataType) * out.mDesc.GetElementSpace());

    in_1_dev.ToDevice(in_1.mData.data());

    if(beta != 0.0f)
        out_dev.ToDevice(out.mData.data());

    InElementwiseOperation in_elementwise_op;
    AccElementwiseOperation acc_elementwise_op;

    std::tie(in_elementwise_op, acc_elementwise_op) =
        reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(
            static_cast<int32_t>(reduce_total_length));

    if(do_verify)
    {
        ReductionHost<InOutDataType,
                      AccDataType,
                      InOutDataType,
                      ReduceOperation,
                      InElementwiseOperation,
                      AccElementwiseOperation,
                      5, // Rank
                      2, // NumReduceDim
                      PropagateNan,
                      OutputIndex>
            hostReduce(in_1.mDesc, out_ref.mDesc, invariantDims, reduceDims);

        hostReduce.Run(alpha,
                       in_1.mData.data(),
                       beta,
                       out_ref.mData.data(),
                       nullptr,
                       in_elementwise_op,
                       acc_elementwise_op);
    };

    std::vector<ck::index_t> i_inLengths_1;
    std::vector<ck::index_t> i_inStrides_1;
    std::vector<ck::index_t> i_inLengths_2;
    std::vector<ck::index_t> i_inStrides_2;
    std::vector<ck::index_t> i_outLengths;
    std::vector<ck::index_t> i_outStrides;

    i_inLengths_1.assign(inLengths_1.begin(), inLengths_1.end());
    i_inStrides_1.assign(inStrides_1.begin(), inStrides_1.end());
    i_inLengths_2.assign(inLengths_2.begin(), inLengths_2.end());
    i_inStrides_2.assign(inStrides_2.begin(), inStrides_2.end());
    i_outLengths.assign(outLengths.begin(), outLengths.end());
    i_outStrides.assign(outStrides.begin(), outStrides.end());

    auto reduce_1 = DeviceReduceInstance_1{};

    auto argument_ptr_1 = reduce_1.MakeArgumentPointer(i_inLengths_1,
                                                       i_inStrides_1,
                                                       i_inLengths_2,
                                                       i_inStrides_2,
                                                       reduceDims_1,
                                                       1.0f,
                                                       0.0f,
                                                       in_1_dev.GetDeviceBuffer(),
                                                       nullptr,
                                                       in_2_dev.GetDeviceBuffer(),
                                                       nullptr,
                                                       in_elementwise_op,
                                                       PassThroughOp{});

    if(!reduce_1.IsSupportedArgument(argument_ptr_1.get()))
    {
        std::cout
            << "The runtime parameters seems not supported by the DeviceReduce instance, exiting!"
            << std::endl;
    };

    auto invoker_ptr_1 = reduce_1.MakeInvokerPointer();

    auto reduce_2 = DeviceReduceInstance_2{};

    auto argument_ptr_2 = reduce_2.MakeArgumentPointer(i_inLengths_2,
                                                       i_inStrides_2,
                                                       i_outLengths,
                                                       i_outStrides,
                                                       reduceDims_2,
                                                       alpha,
                                                       beta,
                                                       in_2_dev.GetDeviceBuffer(),
                                                       nullptr,
                                                       out_dev.GetDeviceBuffer(),
                                                       nullptr,
                                                       PassThroughOp{},
                                                       acc_elementwise_op);

    if(!reduce_2.IsSupportedArgument(argument_ptr_2.get()))
    {
        std::cout
            << "The runtime parameters seems not supported by the DeviceReduce instance, exiting!"
            << std::endl;
    };

    auto invoker_ptr_2 = reduce_2.MakeInvokerPointer();

    float avg_time_1 = invoker_ptr_1->Run(argument_ptr_1.get(), StreamConfig{nullptr, time_kernel});
    float avg_time_2 = invoker_ptr_2->Run(argument_ptr_2.get(), StreamConfig{nullptr, time_kernel});

    std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InOutDataType) +
                            invariant_total_length * sizeof(InOutDataType);

    float gb_per_sec = num_bytes / 1.E6 / (avg_time_1 + avg_time_2);

    std::cout << "Perf: " << avg_time_1 + avg_time_2 << " ms, " << gb_per_sec << " GB/s, "
              << reduce_1.GetTypeString() << " => " << reduce_2.GetTypeString() << std::endl;

    bool pass = true;

    if(do_verify)
    {
        out_dev.FromDevice(out.mData.data());
        pass = pass && ck::utils::check_err(out.mData, out_ref.mData);
    };

    return (pass ? 0 : 1);
}