mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-03-22 16:17:37 +00:00
Fix the Composable Kernel CI and versions incompatibility (#4640) ## Motivation This PR has 4 patches: 1. Fix the CI error of grouped gemm. 2. Fix the incompatibility of old linux version. 3. Fix the potential errors of flatmm. 4. Address the previous comments of abquant eight warps pipeline solution.
339 lines
14 KiB
C++
339 lines
14 KiB
C++
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
|
// SPDX-License-Identifier: MIT
|
|
|
|
#include <iostream>
|
|
#include <numeric>
|
|
#include <sstream>
|
|
#include <initializer_list>
|
|
#include <cstdlib>
|
|
#include <getopt.h>
|
|
|
|
#include "ck/ck.hpp"
|
|
#include "ck/utility/reduction_enums.hpp"
|
|
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
|
|
#include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp"
|
|
#include "ck/library/reference_tensor_operation/cpu/reference_reduce.hpp"
|
|
|
|
#include "ck/library/utility/check_err.hpp"
|
|
#include "ck/library/utility/device_memory.hpp"
|
|
#include "ck/library/utility/host_tensor.hpp"
|
|
#include "ck/library/utility/host_tensor_generator.hpp"
|
|
#include "ck/library/utility/host_common_util.hpp"
|
|
|
|
using ::ck::DeviceMem;
|
|
using ::ck::HostTensorDescriptor;
|
|
using ::ck::Tensor;
|
|
|
|
using namespace ck;
|
|
using namespace ck::tensor_operation::device;
|
|
|
|
using InOutDataType = ck::half_t;
|
|
using InOutDataType = ck::half_t;
|
|
using AccDataType = float;
|
|
|
|
constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::NORM2;
|
|
constexpr bool PropagateNan = true;
|
|
constexpr bool OutputIndex = false;
|
|
|
|
using ReduceOperation = typename reduce_binary_operator<ReduceOpId>::opType;
|
|
using InElementwiseOperation =
|
|
typename reduce_unary_operator<ReduceOpId, true, true>::InElementwiseOperation;
|
|
using AccElementwiseOperation =
|
|
typename reduce_unary_operator<ReduceOpId, true, true>::AccElementwiseOperation;
|
|
|
|
using PassThroughOp = tensor_operation::element_wise::PassThrough;
|
|
|
|
using DeviceReduceInstance_1 = DeviceReduceMultiBlock<InOutDataType,
|
|
AccDataType,
|
|
InOutDataType,
|
|
5, // Rank
|
|
1, // NumReduceDim
|
|
ReduceOperation,
|
|
InElementwiseOperation,
|
|
PassThroughOp,
|
|
InMemoryDataOperationEnum::Set,
|
|
PropagateNan,
|
|
OutputIndex,
|
|
false, // HaveIndexInputIfOutputIndex
|
|
256,
|
|
32,
|
|
8,
|
|
1,
|
|
1,
|
|
1, // vector dim
|
|
1,
|
|
1>;
|
|
|
|
using DeviceReduceInstance_2 = DeviceReduceMultiBlock<InOutDataType,
|
|
AccDataType,
|
|
InOutDataType,
|
|
4, // Rank
|
|
1, // NumReduceDim
|
|
ReduceOperation,
|
|
PassThroughOp,
|
|
AccElementwiseOperation,
|
|
InMemoryDataOperationEnum::Set,
|
|
PropagateNan,
|
|
OutputIndex,
|
|
false, // HaveIndexInputIfOutputIndex
|
|
256,
|
|
128,
|
|
2,
|
|
1,
|
|
1,
|
|
1, // vector dim
|
|
1,
|
|
1>;
|
|
|
|
static bool do_verify;
|
|
static int init_method;
|
|
static float alpha;
|
|
static float beta_;
|
|
static bool time_kernel;
|
|
|
|
int main(int argc, char* argv[])
|
|
{
|
|
// used by the device reduction
|
|
const std::array<int, 1> reduceDims_1 = {4};
|
|
// const std::array<int, 4> invariantDims_1 = {0, 1, 2, 3};
|
|
|
|
const std::array<int, 1> reduceDims_2 = {3};
|
|
// const std::array<int, 3> invariantDims_2 = {0, 1, 2};
|
|
|
|
// used by the host reduction
|
|
const std::array<int, 2> reduceDims = {3, 4};
|
|
// const std::array<int, 3> invariantDims = {0, 1, 2};
|
|
|
|
std::vector<size_t> inLengths_1 = {64, 320, 80, 4, 128};
|
|
|
|
// input lengths of the second reduction, which is also the output lengths of the first
|
|
// reduction
|
|
std::vector<size_t> inLengths_2 = {64, 320, 80, 4};
|
|
|
|
std::vector<size_t> outLengths = {64, 320, 80};
|
|
|
|
if(argc == 1)
|
|
{
|
|
do_verify = true;
|
|
init_method = 2;
|
|
time_kernel = true;
|
|
}
|
|
else if((argc == 4) || (argc == 9))
|
|
{
|
|
do_verify = static_cast<bool>(argv[1]);
|
|
init_method = atoi(argv[2]);
|
|
time_kernel = static_cast<bool>(atoi(argv[3]));
|
|
if(argc == 9)
|
|
{
|
|
inLengths_1[0] = atoi(argv[4]);
|
|
inLengths_1[1] = atoi(argv[5]);
|
|
inLengths_1[2] = atoi(argv[6]);
|
|
inLengths_1[3] = atoi(argv[7]);
|
|
inLengths_1[4] = atoi(argv[8]);
|
|
inLengths_2[0] = inLengths_1[0];
|
|
inLengths_2[1] = inLengths_1[1];
|
|
inLengths_2[2] = inLengths_1[2];
|
|
inLengths_2[3] = inLengths_1[3];
|
|
outLengths[0] = inLengths_1[0];
|
|
outLengths[1] = inLengths_1[1];
|
|
outLengths[2] = inLengths_1[2];
|
|
}
|
|
}
|
|
else
|
|
{
|
|
std::ostringstream ostr;
|
|
|
|
ostr << "Wrong parameter! " << std::endl
|
|
<< "Usage: " << argv[0] << "[verify 0/1] init_method time_kernel" << std::endl;
|
|
|
|
throw std::runtime_error(ostr.str());
|
|
};
|
|
|
|
alpha = 1.0f;
|
|
beta_ = 0.0f;
|
|
|
|
Tensor<InOutDataType> in_1(inLengths_1);
|
|
|
|
Tensor<InOutDataType> out_ref(outLengths);
|
|
Tensor<InOutDataType> in_2(inLengths_2); // also the output tensor of the first reduction
|
|
Tensor<InOutDataType> out(outLengths);
|
|
|
|
auto inStrides_1 = in_1.mDesc.GetStrides();
|
|
auto inStrides_2 = in_2.mDesc.GetStrides();
|
|
auto outStrides = out.mDesc.GetStrides();
|
|
|
|
size_t invariant_total_length = out.mDesc.GetElementSize();
|
|
size_t reduce_total_length = in_1.mDesc.GetElementSize() / invariant_total_length;
|
|
|
|
std::size_t num_thread = 1;
|
|
|
|
if(do_verify)
|
|
{
|
|
switch(init_method)
|
|
{
|
|
case 0: break;
|
|
case 1:
|
|
in_1.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread);
|
|
if(beta_ != 0.0f)
|
|
out_ref.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread);
|
|
break;
|
|
case 2:
|
|
in_1.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
|
|
if(beta_ != 0.0f)
|
|
out_ref.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
|
|
break;
|
|
default:
|
|
in_1.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0, 5.0}, num_thread);
|
|
if(beta_ != 0.0f)
|
|
out_ref.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0, 5.0},
|
|
num_thread);
|
|
}
|
|
|
|
if(beta_ != 0.0f)
|
|
for(size_t i = 0; i < out_ref.mDesc.GetElementSpaceSize(); i++)
|
|
out.mData[i] = out_ref.mData[i];
|
|
};
|
|
|
|
DeviceMem in_1_dev(sizeof(InOutDataType) * in_1.mDesc.GetElementSpaceSize());
|
|
DeviceMem in_2_dev(sizeof(InOutDataType) * in_2.mDesc.GetElementSpaceSize());
|
|
DeviceMem out_dev(sizeof(InOutDataType) * out.mDesc.GetElementSpaceSize());
|
|
|
|
in_1_dev.ToDevice(in_1.mData.data());
|
|
|
|
if(beta_ != 0.0f)
|
|
out_dev.ToDevice(out.mData.data());
|
|
|
|
InElementwiseOperation in_elementwise_op;
|
|
AccElementwiseOperation acc_elementwise_op;
|
|
|
|
std::tie(in_elementwise_op, acc_elementwise_op) =
|
|
reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(
|
|
static_cast<int32_t>(reduce_total_length));
|
|
|
|
std::array<index_t, 5> arrInLengths_1;
|
|
std::array<index_t, 5> arrInStrides_1;
|
|
std::array<index_t, 4> arrInLengths_2;
|
|
std::array<index_t, 4> arrInStrides_2;
|
|
std::array<index_t, 3> arrOutLengths;
|
|
std::array<index_t, 3> arrOutStrides;
|
|
|
|
ck::ranges::copy(inLengths_1, arrInLengths_1.begin());
|
|
ck::ranges::copy(inStrides_1, arrInStrides_1.begin());
|
|
ck::ranges::copy(inLengths_2, arrInLengths_2.begin());
|
|
ck::ranges::copy(inStrides_2, arrInStrides_2.begin());
|
|
ck::ranges::copy(outLengths, arrOutLengths.begin());
|
|
ck::ranges::copy(outStrides, arrOutStrides.begin());
|
|
|
|
if(do_verify)
|
|
{
|
|
using ReferenceReduceInstance =
|
|
ck::tensor_operation::host::ReferenceReduce<InOutDataType,
|
|
AccDataType,
|
|
InOutDataType,
|
|
5,
|
|
2,
|
|
ReduceOperation,
|
|
InElementwiseOperation,
|
|
AccElementwiseOperation,
|
|
PropagateNan,
|
|
OutputIndex>;
|
|
|
|
auto reduce_ref = ReferenceReduceInstance{};
|
|
|
|
auto argument_ptr_ref = reduce_ref.MakeArgumentPointer(arrInLengths_1,
|
|
arrInStrides_1,
|
|
arrOutLengths,
|
|
arrOutStrides,
|
|
reduceDims,
|
|
static_cast<double>(alpha),
|
|
static_cast<double>(beta_),
|
|
in_1.mData.data(),
|
|
nullptr,
|
|
out_ref.mData.data(),
|
|
nullptr,
|
|
in_elementwise_op,
|
|
acc_elementwise_op);
|
|
|
|
if(!reduce_ref.IsSupportedArgument(argument_ptr_ref.get()))
|
|
{
|
|
std::cout << "The runtime parameters not supported by the reduce reference, exiting!"
|
|
<< std::endl;
|
|
return (false);
|
|
};
|
|
|
|
auto invoker_ptr_ref = reduce_ref.MakeInvokerPointer();
|
|
|
|
invoker_ptr_ref->Run(argument_ptr_ref.get());
|
|
};
|
|
|
|
auto reduce_1 = DeviceReduceInstance_1{};
|
|
|
|
auto argument_ptr_1 = reduce_1.MakeArgumentPointer(arrInLengths_1,
|
|
arrInStrides_1,
|
|
arrInLengths_2,
|
|
arrInStrides_2,
|
|
reduceDims_1,
|
|
1.0,
|
|
0.0,
|
|
in_1_dev.GetDeviceBuffer(),
|
|
nullptr,
|
|
in_2_dev.GetDeviceBuffer(),
|
|
nullptr,
|
|
in_elementwise_op,
|
|
PassThroughOp{});
|
|
|
|
if(!reduce_1.IsSupportedArgument(argument_ptr_1.get()))
|
|
{
|
|
std::cout << "The runtime parameters seems supported by the DeviceReduce instance, exiting!"
|
|
<< std::endl;
|
|
};
|
|
|
|
auto invoker_ptr_1 = reduce_1.MakeInvokerPointer();
|
|
|
|
auto reduce_2 = DeviceReduceInstance_2{};
|
|
|
|
auto argument_ptr_2 = reduce_2.MakeArgumentPointer(arrInLengths_2,
|
|
arrInStrides_2,
|
|
arrOutLengths,
|
|
arrOutStrides,
|
|
reduceDims_2,
|
|
static_cast<double>(alpha),
|
|
static_cast<double>(beta_),
|
|
in_2_dev.GetDeviceBuffer(),
|
|
nullptr,
|
|
out_dev.GetDeviceBuffer(),
|
|
nullptr,
|
|
PassThroughOp{},
|
|
acc_elementwise_op);
|
|
|
|
if(!reduce_2.IsSupportedArgument(argument_ptr_2.get()))
|
|
{
|
|
std::cout
|
|
<< "The runtime parameters seems not supported by the DeviceReduce instance, exiting!"
|
|
<< std::endl;
|
|
};
|
|
|
|
auto invoker_ptr_2 = reduce_2.MakeInvokerPointer();
|
|
|
|
float avg_time_1 = invoker_ptr_1->Run(argument_ptr_1.get(), StreamConfig{nullptr, time_kernel});
|
|
float avg_time_2 = invoker_ptr_2->Run(argument_ptr_2.get(), StreamConfig{nullptr, time_kernel});
|
|
|
|
std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InOutDataType) +
|
|
invariant_total_length * sizeof(InOutDataType);
|
|
|
|
float gb_per_sec = num_bytes / 1.E6 / (avg_time_1 + avg_time_2);
|
|
|
|
std::cout << "Perf: " << avg_time_1 + avg_time_2 << " ms, " << gb_per_sec << " GB/s, "
|
|
<< reduce_1.GetTypeString() << " => " << reduce_2.GetTypeString() << std::endl;
|
|
|
|
bool pass = true;
|
|
|
|
if(do_verify)
|
|
{
|
|
out_dev.FromDevice(out.mData.data());
|
|
pass = pass && ck::utils::check_err(out, out_ref);
|
|
};
|
|
|
|
return (pass ? 0 : 1);
|
|
}
|