Files
composable_kernel/example/12_reduce/reduce_blockwise_two_call.cpp
John Shumway ad57f6ef0b [CK_BUILDER] Put global CK functions in an the CK namespace (#3232)
* Wrap ck host utitlies in CK namespace.

The CK and CK-Tile source code bases are incompatible because CK is not properly using namespaces everywhere. In particular, we need to put hip_check_error in the ck namespace.

Move all functions in include/ck_/host_utility that were in global namespace into the ck namespace.

There may be additional namespace problems like this, and it's possible we'll have namespace clashes. But it is good design to properly guard our to code bases (CK and CKTile) so that they can both coexist. Moreover, estabilishing this compatiblity is essential if we are going to allow the builder to instantiate  kernels from either template library.

* Add using declarations to test code.

After moving some of the untils into the ck namespace, most examples and a few tests had to be updated to recognize the new namespace declarations. We add using declarations to individual compute units for functions that were previously in the global namespace.

* Add using declarations to client examples.
2025-11-19 11:23:02 +01:00

339 lines
14 KiB
C++

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <sstream>
#include <initializer_list>
#include <cstdlib>
#include <getopt.h>
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_reduce.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_common_util.hpp"
using ::ck::DeviceMem;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
using namespace ck;
using namespace ck::tensor_operation::device;
using InOutDataType = ck::half_t;
using InOutDataType = ck::half_t;
using AccDataType = float;
constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::NORM2;
constexpr bool PropagateNan = true;
constexpr bool OutputIndex = false;
using ReduceOperation = typename reduce_binary_operator<ReduceOpId>::opType;
using InElementwiseOperation =
typename reduce_unary_operator<ReduceOpId, true, true>::InElementwiseOperation;
using AccElementwiseOperation =
typename reduce_unary_operator<ReduceOpId, true, true>::AccElementwiseOperation;
using PassThroughOp = tensor_operation::element_wise::PassThrough;
using DeviceReduceInstance_1 = DeviceReduceMultiBlock<InOutDataType,
AccDataType,
InOutDataType,
5, // Rank
1, // NumReduceDim
ReduceOperation,
InElementwiseOperation,
PassThroughOp,
InMemoryDataOperationEnum::Set,
PropagateNan,
OutputIndex,
false, // HaveIndexInputIfOutputIndex
256,
32,
8,
1,
1,
1, // vector dim
1,
1>;
using DeviceReduceInstance_2 = DeviceReduceMultiBlock<InOutDataType,
AccDataType,
InOutDataType,
4, // Rank
1, // NumReduceDim
ReduceOperation,
PassThroughOp,
AccElementwiseOperation,
InMemoryDataOperationEnum::Set,
PropagateNan,
OutputIndex,
false, // HaveIndexInputIfOutputIndex
256,
128,
2,
1,
1,
1, // vector dim
1,
1>;
static bool do_verify;
static int init_method;
static float alpha;
static float beta;
static bool time_kernel;
int main(int argc, char* argv[])
{
// used by the device reduction
const std::array<int, 1> reduceDims_1 = {4};
// const std::array<int, 4> invariantDims_1 = {0, 1, 2, 3};
const std::array<int, 1> reduceDims_2 = {3};
// const std::array<int, 3> invariantDims_2 = {0, 1, 2};
// used by the host reduction
const std::array<int, 2> reduceDims = {3, 4};
// const std::array<int, 3> invariantDims = {0, 1, 2};
std::vector<size_t> inLengths_1 = {64, 320, 80, 4, 128};
// input lengths of the second reduction, which is also the output lengths of the first
// reduction
std::vector<size_t> inLengths_2 = {64, 320, 80, 4};
std::vector<size_t> outLengths = {64, 320, 80};
if(argc == 1)
{
do_verify = true;
init_method = 2;
time_kernel = true;
}
else if((argc == 4) || (argc == 9))
{
do_verify = static_cast<bool>(argv[1]);
init_method = atoi(argv[2]);
time_kernel = static_cast<bool>(atoi(argv[3]));
if(argc == 9)
{
inLengths_1[0] = atoi(argv[4]);
inLengths_1[1] = atoi(argv[5]);
inLengths_1[2] = atoi(argv[6]);
inLengths_1[3] = atoi(argv[7]);
inLengths_1[4] = atoi(argv[8]);
inLengths_2[0] = inLengths_1[0];
inLengths_2[1] = inLengths_1[1];
inLengths_2[2] = inLengths_1[2];
inLengths_2[3] = inLengths_1[3];
outLengths[0] = inLengths_1[0];
outLengths[1] = inLengths_1[1];
outLengths[2] = inLengths_1[2];
}
}
else
{
std::ostringstream ostr;
ostr << "Wrong parameter! " << std::endl
<< "Usage: " << argv[0] << "[verify 0/1] init_method time_kernel" << std::endl;
throw std::runtime_error(ostr.str());
};
alpha = 1.0f;
beta = 0.0f;
Tensor<InOutDataType> in_1(inLengths_1);
Tensor<InOutDataType> out_ref(outLengths);
Tensor<InOutDataType> in_2(inLengths_2); // also the output tensor of the first reduction
Tensor<InOutDataType> out(outLengths);
auto inStrides_1 = in_1.mDesc.GetStrides();
auto inStrides_2 = in_2.mDesc.GetStrides();
auto outStrides = out.mDesc.GetStrides();
size_t invariant_total_length = out.mDesc.GetElementSize();
size_t reduce_total_length = in_1.mDesc.GetElementSize() / invariant_total_length;
std::size_t num_thread = 1;
if(do_verify)
{
switch(init_method)
{
case 0: break;
case 1:
in_1.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread);
if(beta != 0.0f)
out_ref.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread);
break;
case 2:
in_1.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
if(beta != 0.0f)
out_ref.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
break;
default:
in_1.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0, 5.0}, num_thread);
if(beta != 0.0f)
out_ref.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0, 5.0},
num_thread);
}
if(beta != 0.0f)
for(size_t i = 0; i < out_ref.mDesc.GetElementSpaceSize(); i++)
out.mData[i] = out_ref.mData[i];
};
DeviceMem in_1_dev(sizeof(InOutDataType) * in_1.mDesc.GetElementSpaceSize());
DeviceMem in_2_dev(sizeof(InOutDataType) * in_2.mDesc.GetElementSpaceSize());
DeviceMem out_dev(sizeof(InOutDataType) * out.mDesc.GetElementSpaceSize());
in_1_dev.ToDevice(in_1.mData.data());
if(beta != 0.0f)
out_dev.ToDevice(out.mData.data());
InElementwiseOperation in_elementwise_op;
AccElementwiseOperation acc_elementwise_op;
std::tie(in_elementwise_op, acc_elementwise_op) =
reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(
static_cast<int32_t>(reduce_total_length));
std::array<index_t, 5> arrInLengths_1;
std::array<index_t, 5> arrInStrides_1;
std::array<index_t, 4> arrInLengths_2;
std::array<index_t, 4> arrInStrides_2;
std::array<index_t, 3> arrOutLengths;
std::array<index_t, 3> arrOutStrides;
ck::ranges::copy(inLengths_1, arrInLengths_1.begin());
ck::ranges::copy(inStrides_1, arrInStrides_1.begin());
ck::ranges::copy(inLengths_2, arrInLengths_2.begin());
ck::ranges::copy(inStrides_2, arrInStrides_2.begin());
ck::ranges::copy(outLengths, arrOutLengths.begin());
ck::ranges::copy(outStrides, arrOutStrides.begin());
if(do_verify)
{
using ReferenceReduceInstance =
ck::tensor_operation::host::ReferenceReduce<InOutDataType,
AccDataType,
InOutDataType,
5,
2,
ReduceOperation,
InElementwiseOperation,
AccElementwiseOperation,
PropagateNan,
OutputIndex>;
auto reduce_ref = ReferenceReduceInstance{};
auto argument_ptr_ref = reduce_ref.MakeArgumentPointer(arrInLengths_1,
arrInStrides_1,
arrOutLengths,
arrOutStrides,
reduceDims,
static_cast<double>(alpha),
static_cast<double>(beta),
in_1.mData.data(),
nullptr,
out_ref.mData.data(),
nullptr,
in_elementwise_op,
acc_elementwise_op);
if(!reduce_ref.IsSupportedArgument(argument_ptr_ref.get()))
{
std::cout << "The runtime parameters not supported by the reduce reference, exiting!"
<< std::endl;
return (false);
};
auto invoker_ptr_ref = reduce_ref.MakeInvokerPointer();
invoker_ptr_ref->Run(argument_ptr_ref.get());
};
auto reduce_1 = DeviceReduceInstance_1{};
auto argument_ptr_1 = reduce_1.MakeArgumentPointer(arrInLengths_1,
arrInStrides_1,
arrInLengths_2,
arrInStrides_2,
reduceDims_1,
1.0,
0.0,
in_1_dev.GetDeviceBuffer(),
nullptr,
in_2_dev.GetDeviceBuffer(),
nullptr,
in_elementwise_op,
PassThroughOp{});
if(!reduce_1.IsSupportedArgument(argument_ptr_1.get()))
{
std::cout << "The runtime parameters seems supported by the DeviceReduce instance, exiting!"
<< std::endl;
};
auto invoker_ptr_1 = reduce_1.MakeInvokerPointer();
auto reduce_2 = DeviceReduceInstance_2{};
auto argument_ptr_2 = reduce_2.MakeArgumentPointer(arrInLengths_2,
arrInStrides_2,
arrOutLengths,
arrOutStrides,
reduceDims_2,
static_cast<double>(alpha),
static_cast<double>(beta),
in_2_dev.GetDeviceBuffer(),
nullptr,
out_dev.GetDeviceBuffer(),
nullptr,
PassThroughOp{},
acc_elementwise_op);
if(!reduce_2.IsSupportedArgument(argument_ptr_2.get()))
{
std::cout
<< "The runtime parameters seems not supported by the DeviceReduce instance, exiting!"
<< std::endl;
};
auto invoker_ptr_2 = reduce_2.MakeInvokerPointer();
float avg_time_1 = invoker_ptr_1->Run(argument_ptr_1.get(), StreamConfig{nullptr, time_kernel});
float avg_time_2 = invoker_ptr_2->Run(argument_ptr_2.get(), StreamConfig{nullptr, time_kernel});
std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InOutDataType) +
invariant_total_length * sizeof(InOutDataType);
float gb_per_sec = num_bytes / 1.E6 / (avg_time_1 + avg_time_2);
std::cout << "Perf: " << avg_time_1 + avg_time_2 << " ms, " << gb_per_sec << " GB/s, "
<< reduce_1.GetTypeString() << " => " << reduce_2.GetTypeString() << std::endl;
bool pass = true;
if(do_verify)
{
out_dev.FromDevice(out.mData.data());
pass = pass && ck::utils::check_err(out, out_ref);
};
return (pass ? 0 : 1);
}