[Ck tile] support rmsnorm and related fusion (#1605)

* Add reduce2d new api

* Prevent users from using cross-warp reduction

* Fix bug of std calculation

* Add rmsnorm2d

* Add rmsnorm small example

* Remove static assert to prevent compile fail

* Add script to test performance and correctness

* Add missing cmake change

* refine naming

* refine example of rmsnorm

* Fix bug of rmsnorm

* Refine naming

* Fix cmake

* clang format

* Refine pipeline name

* Add add_rmsnorm2d_rdquant kernel

* Add reduce op

* host verification

* Fix bug of one pass pipeline

* Refine tile size

* Add two pass pipeline

* Rename two pass to three pass

* Fix bug of kSaveX == false

* Add instance library

* Add test script

* Fix bug of x verification

* Add save_x to trait

* Add README

* Move reduce2d into reduce folder

* Fix bug of welford when number of m warp > 1

* Remove redundant comment

* 1. move 06_rmsnorm2d to 10_rmsnorm2d
2. move 07_add_rmsnorm2d_rdquant to 11_add_rmsnorm2d_rdquant

* clang format and add missing header

* Add host validation of add + layernorm2d + rsquant

* Revert "Add host validation of add + layernorm2d + rsquant"

This reverts commit 936cb45797.

* Remove deprecated flag
This commit is contained in:
rocking
2024-10-30 15:22:56 +08:00
committed by GitHub
parent 8632221814
commit 3d60953477
90 changed files with 4674 additions and 128 deletions

View File

@@ -19,9 +19,9 @@ auto create_args(int argc, char* argv[])
template <typename DataType>
bool run(const ck_tile::ArgParser& arg_parser)
{
using ADataType = DataType;
using AccDataType = float;
using BDataType = DataType;
using XDataType = DataType;
using ComputeDataType = float;
using YDataType = DataType;
ck_tile::index_t m = arg_parser.get_int("m");
ck_tile::index_t n = arg_parser.get_int("n");
@@ -29,35 +29,39 @@ bool run(const ck_tile::ArgParser& arg_parser)
int warmup = arg_parser.get_int("warmup");
int repeat = arg_parser.get_int("repeat");
ck_tile::HostTensor<ADataType> a_host({m, n});
ck_tile::HostTensor<BDataType> b_host_ref({m});
ck_tile::HostTensor<BDataType> b_host_dev({m});
ck_tile::HostTensor<XDataType> x_host({m, n});
ck_tile::HostTensor<YDataType> y_host_ref({m});
ck_tile::HostTensor<YDataType> y_host_dev({m});
ck_tile::FillUniformDistribution<ADataType>{-5.f, 5.f}(a_host);
ck_tile::FillUniformDistribution<XDataType>{-5.f, 5.f}(x_host);
ck_tile::DeviceMem a_buf(a_host.get_element_space_size_in_bytes());
ck_tile::DeviceMem b_buf(b_host_dev.get_element_space_size_in_bytes());
ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes());
ck_tile::DeviceMem y_buf(y_host_dev.get_element_space_size_in_bytes());
a_buf.ToDevice(a_host.data());
x_buf.ToDevice(x_host.data());
using ReduceOp = ck_tile::ReduceOp::Add;
using BlockWarps = ck_tile::sequence<4, 1>;
using BlockTile = ck_tile::sequence<128, 128>;
using WarpTile = ck_tile::sequence<32, 128>;
using ThreadTile = ck_tile::sequence<8, 8>;
using Vector = ck_tile::sequence<8, 8>;
constexpr ck_tile::index_t kBlockSize = 256;
// cross warp-reduce
// using BlockWarps = ck_tile::sequence<2, 2>;
// using BlockTile = ck_tile::sequence<2, 1024>;
// using WarpTile = ck_tile::sequence<1, 512>;
// using Vector = ck_tile::sequence<1, 8>;
constexpr ck_tile::index_t kBlockSize = 512;
constexpr ck_tile::index_t kBlockPerCu = 1;
ck_tile::index_t kGridSize = (m / BlockTile::at(ck_tile::number<0>{}));
std::cout << "grid size " << kGridSize << std::endl;
using Kernel = ck_tile::Reduce<ADataType,
AccDataType,
BDataType,
kBlockSize,
BlockWarps,
BlockTile,
WarpTile,
ThreadTile>;
using Shape = ck_tile::Reduce2dShape<BlockWarps, BlockTile, WarpTile, Vector>;
using Porblem =
ck_tile::Reduce2dProblem<XDataType, ComputeDataType, YDataType, Shape, ReduceOp>;
using Kernel = ck_tile::Reduce<Porblem>;
float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
@@ -65,12 +69,12 @@ bool run(const ck_tile::ArgParser& arg_parser)
kGridSize,
kBlockSize,
0,
static_cast<ADataType*>(a_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_buf.GetDeviceBuffer()),
static_cast<XDataType*>(x_buf.GetDeviceBuffer()),
static_cast<YDataType*>(y_buf.GetDeviceBuffer()),
m,
n));
std::size_t num_btype = sizeof(ADataType) * m * n + sizeof(BDataType) * m;
std::size_t num_btype = sizeof(XDataType) * m * n + sizeof(YDataType) * m;
float gb_per_sec = num_btype / 1.E6 / ave_time;
@@ -81,9 +85,10 @@ bool run(const ck_tile::ArgParser& arg_parser)
if(do_validation)
{
// reference
ck_tile::reference_reduce<ADataType, AccDataType, BDataType>(a_host, b_host_ref);
b_buf.FromDevice(b_host_dev.mData.data());
pass = ck_tile::check_err(b_host_dev, b_host_ref);
ck_tile::reference_reduce<XDataType, ComputeDataType, YDataType>(
x_host, y_host_ref, ReduceOp{});
y_buf.FromDevice(y_host_dev.mData.data());
pass = ck_tile::check_err(y_host_dev, y_host_ref);
std::cout << "valid:" << (pass ? "y" : "n") << std::flush << std::endl;
}
@@ -103,8 +108,8 @@ int main(int argc, char* argv[])
{
return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
}
if(data_type == "bf16")
{
return run<ck_tile::bf16_t>(arg_parser) ? 0 : -2;
}
// else if(data_type == "bf16")
// {
// return run<ck_tile::bf16_t>(arg_parser) ? 0 : -2;
// }
}