host verification

This commit is contained in:
rocking
2024-10-25 23:52:01 +00:00
parent 826ee18a11
commit 27d96b4031
4 changed files with 196 additions and 34 deletions

View File

@@ -4,6 +4,32 @@
#include "ck_tile/ops/add_rmsnorm2d_rdquant.hpp"
#include <cstring>
// different threshold for different dtype
template <typename DataType>
auto get_elimit()
{
double rtol = 1e-2;
double atol = 1e-2;
return ck_tile::make_tuple(rtol, atol);
}
template <>
auto get_elimit<ck_tile::bf16_t>()
{
double rtol = 1e-2;
double atol = 1e-2;
return ck_tile::make_tuple(rtol, atol);
}
template <>
auto get_elimit<ck_tile::int8_t>()
{
// due to rounding, int8 quantization might have 1 abs error
double rtol = 1;
double atol = 1;
return ck_tile::make_tuple(rtol, atol);
}
auto create_args(int argc, char* argv[])
{
ck_tile::ArgParser arg_parser;
@@ -41,12 +67,12 @@ bool run(const ck_tile::ArgParser& arg_parser)
using GammaDataType = DataType;
using XDataType = DataType;
using YScaleDataType = DataType;
using QYDataType = int8_t;
using QYDataType = ck_tile::int8_t;
using ComputeDataType = float;
// host verify
ck_tile::HostTensor<XDataType> a_host({m, n}, {stride, 1});
ck_tile::HostTensor<XDataType> b_host({m, n}, {stride, 1});
ck_tile::HostTensor<ADataType> a_host({m, n}, {stride, 1});
ck_tile::HostTensor<BDataType> b_host({m, n}, {stride, 1});
ck_tile::HostTensor<GammaDataType> gamma_host({n});
ck_tile::HostTensor<XDataType> x_host_ref({m, n}, {stride, 1});
@@ -121,41 +147,95 @@ bool run(const ck_tile::ArgParser& arg_parser)
if(do_validation)
{
using YDataType = ComputeDataType;
using InvRmsDataType = DataType;
ck_tile::HostTensor<InvRmsDataType> invRms_host_ref({m});
// ck_tile::reference_rmsnorm2d_fwd<XDataType,
// GammaDataType,
// ComputeDataType,
// QYDataType,
// InvRmsDataType>(
// x_host, gamma_host, qy_host_ref, invRms_host_ref, epsilon);
// Add
{
auto op = [](const auto& v0, const auto& v1) { return v0 + v1; };
ck_tile::reference_binary_elementwise<ADataType, BDataType, XDataType, ComputeDataType>(
a_host, b_host, x_host_ref, op);
// qy_buf.FromDevice(qy_host_dev.data());
x_buf.FromDevice(x_host_dev.data());
// auto [rtol, atol] = ck_tile::make_tuple(1e-3, 1e-3);
// if(stride == n)
// {
// pass = ck_tile::check_err(
// qy_host_dev, qy_host_ref, std::string("OUT Error: Incorrect results!"), rtol,
// atol);
// }
// else
// {
// for(int i_r = 0; i_r < m; i_r++)
// {
// std::vector<QYDataType> qy_host_dev_row(qy_host_dev.begin() + i_r * stride,
// qy_host_dev.begin() + i_r * stride + n);
// std::vector<QYDataType> qy_host_ref_row(qy_host_ref.begin() + i_r * stride,
// qy_host_ref.begin() + i_r * stride + n);
// pass &= ck_tile::check_err(qy_host_dev_row,
// qy_host_ref_row,
// std::string("OUT[") + std::to_string(i_r) +
// std::string("] Error: Incorrect results!"),
// rtol,
// atol);
// }
// }
auto [rtol, atol] = get_elimit<XDataType>();
pass = ck_tile::check_err(
x_host_dev, x_host_ref, std::string("x Error: Incorrect results!"), rtol, atol);
}
ck_tile::HostTensor<YDataType> y_host({m, n});
// RmsNorm2d
{
ck_tile::HostTensor<InvRmsDataType> invRms_host_ref({m});
// CAUSION: kernel use ComputeDataType version of x, but we use XDataType here for
// simplicity
ck_tile::reference_rmsnorm2d_fwd<XDataType,
GammaDataType,
ComputeDataType,
YDataType,
InvRmsDataType>(
x_host_ref, gamma_host, y_host, invRms_host_ref, epsilon);
}
// yscale
{
ck_tile::HostTensor<YDataType> y_rowwise_amax_host({m});
using ReduceAmax = ck_tile::ReduceOp::AbsMax;
ck_tile::reference_reduce<YDataType, ComputeDataType, YDataType>(
y_host, y_rowwise_amax_host, ReduceAmax{});
auto op = [](const auto& v0) {
return v0 /
ck_tile::type_convert<ComputeDataType>(ck_tile::numeric<QYDataType>::max());
};
ck_tile::reference_unary_elementwise<YDataType, YScaleDataType, ComputeDataType>(
y_rowwise_amax_host, yscale_host_ref, op);
yscale_buf.FromDevice(yscale_host_dev.mData.data());
auto [rtol, atol] = get_elimit<YScaleDataType>();
pass &= ck_tile::check_err(yscale_host_dev,
yscale_host_ref,
std::string("yscale Error: Incorrect results!"),
rtol,
atol);
}
// rowwise quantization
{
ck_tile::reference_rowwise_quantization2d<YDataType, YScaleDataType, QYDataType>(
y_host, yscale_host_ref, qy_host_ref);
qy_buf.FromDevice(qy_host_dev.data());
auto [rtol, atol] = get_elimit<QYDataType>();
if(stride == n)
{
pass = ck_tile::check_err(qy_host_dev,
qy_host_ref,
std::string("qy Error: Incorrect results!"),
rtol,
atol);
}
else
{
for(int i_r = 0; i_r < m; i_r++)
{
std::vector<QYDataType> qy_host_dev_row(qy_host_dev.begin() + i_r * stride,
qy_host_dev.begin() + i_r * stride + n);
std::vector<QYDataType> qy_host_ref_row(qy_host_ref.begin() + i_r * stride,
qy_host_ref.begin() + i_r * stride + n);
pass &= ck_tile::check_err(qy_host_dev_row,
qy_host_ref_row,
std::string("qy[") + std::to_string(i_r) +
std::string("] Error: Incorrect results!"),
rtol,
atol);
}
}
}
std::cout << "[" << data_type << "]"
<< " m:" << m << ", n:" << n << ", stride:" << stride

View File

@@ -19,11 +19,13 @@
#include "ck_tile/host/reference/reference_batched_masking.hpp"
#include "ck_tile/host/reference/reference_batched_rotary_position_embedding.hpp"
#include "ck_tile/host/reference/reference_batched_softmax.hpp"
#include "ck_tile/host/reference/reference_elementwise.hpp"
#include "ck_tile/host/reference/reference_gemm.hpp"
#include "ck_tile/host/reference/reference_im2col.hpp"
#include "ck_tile/host/reference/reference_layernorm2d_fwd.hpp"
#include "ck_tile/host/reference/reference_reduce.hpp"
#include "ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp"
#include "ck_tile/host/reference/reference_rowwise_quantization2d.hpp"
#include "ck_tile/host/reference/reference_softmax.hpp"
#include "ck_tile/host/stream_config.hpp"
#include "ck_tile/host/timer.hpp"

View File

@@ -0,0 +1,47 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <thread>
namespace ck_tile {
template <typename ADataType, typename BDataType, typename ComputeDataType, typename ElementOp>
CK_TILE_HOST void reference_unary_elementwise(const HostTensor<ADataType>& a,
HostTensor<BDataType>& b,
ElementOp element_op)
{
// TODO: imeplement gpu version reference function
auto f = [&](auto i) {
auto v_a = type_convert<ComputeDataType>(a.mData[i]);
auto v_b = element_op(v_a);
b.mData[i] = ck_tile::type_convert<BDataType>(v_b);
};
make_ParallelTensorFunctor(f, b.get_element_space_size())(std::thread::hardware_concurrency());
}
template <typename ADataType,
typename BDataType,
typename CDataType,
typename ComputeDataType,
typename ElementOp>
CK_TILE_HOST void reference_binary_elementwise(const HostTensor<ADataType>& a,
const HostTensor<BDataType>& b,
HostTensor<CDataType>& c,
ElementOp element_op)
{
// TODO: imeplement gpu version reference function
auto f = [&](auto i) {
auto v_a = type_convert<ComputeDataType>(a.mData[i]);
auto v_b = type_convert<ComputeDataType>(b.mData[i]);
auto v_c = element_op(v_a, v_b);
c.mData[i] = ck_tile::type_convert<CDataType>(v_c);
};
make_ParallelTensorFunctor(f, c.get_element_space_size())(std::thread::hardware_concurrency());
}
} // namespace ck_tile

View File

@@ -0,0 +1,33 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <thread>
namespace ck_tile {
template <typename XDataType, typename ScaleDataType, typename QXDataType>
CK_TILE_HOST void reference_rowwise_quantization2d(const HostTensor<XDataType>& x_m_n,
const HostTensor<ScaleDataType>& scale_m,
HostTensor<QXDataType>& qx_m_n)
{
auto f = [&](auto m) {
const int N = x_m_n.mDesc.get_lengths()[1];
for(int n = 0; n < N; ++n)
{
auto v_x = x_m_n(m, n);
// scale = amax / 127 for int8
auto v_scale = type_convert<XDataType>(scale_m(m));
auto v_qx = v_x / v_scale;
qx_m_n(m, n) = saturates<QXDataType>{}(v_qx);
}
};
make_ParallelTensorFunctor(f,
scale_m.mDesc.get_lengths()[0])(std::thread::hardware_concurrency());
}
} // namespace ck_tile