mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-19 22:39:03 +00:00
* port layernorm
* change warp_welford.hpp
* Update warpshuffle
* 1. Add save mean and save std back
2. Move construction of tensor_view and tile_window to operator()
* refine welford max count calculation
* unify layernorm api
* Rename file
* Remove save mean and inv std
* Revert "refine welford max count calculation"
This reverts commit 022365802b.
* Fix order of parameter
* refine welford max count calculation again
* Remove fp32 instances
* Fix bug of padding
* refactor api
* Support bf16
* Extract common function
* Refine arg of operator()
* Add kMThreadPerBlock to template parameter
* clang format
* Refine variable name
* Refine file name
* remove redundant line
* refactor layernorm2d pipeline and add block-per-block utility
* fix name
* rename more
* add more block-per-tile instance
* remove duplicated define
* update instance for 2048, 1024 case
* support up to 2048 now
* opt loading
* add n1536
* Add two pass pipeline
* format
* Fix incorrect type
* parallel compilation
* Use smaller N
* fix 2p pass
* Support Repeat_M in distribution
* Refine nameing
* Add reduce example
---------
Co-authored-by: letaoqin <letaoqin@amd.com>
Co-authored-by: aska-0096 <haocwang@amd.com>
Co-authored-by: rocking <ChunYu.Lai@amd.com>
Co-authored-by: carlushuang <carlus.huang@amd.com>
111 lines
3.7 KiB
C++
111 lines
3.7 KiB
C++
#include "ck_tile/host.hpp"
|
|
#include "reduce.hpp"
|
|
#include <cstring>
|
|
|
|
auto create_args(int argc, char* argv[])
|
|
{
|
|
ck_tile::ArgParser arg_parser;
|
|
arg_parser.insert("m", "3328", "m dimension")
|
|
.insert("n", "4096", "n dimension")
|
|
.insert("v", "1", "cpu validation or not")
|
|
.insert("prec", "fp16", "precision")
|
|
.insert("warmup", "5", "cold iter")
|
|
.insert("repeat", "20", "hot iter");
|
|
|
|
bool result = arg_parser.parse(argc, argv);
|
|
return std::make_tuple(result, arg_parser);
|
|
}
|
|
|
|
template <typename DataType>
|
|
bool run(const ck_tile::ArgParser& arg_parser)
|
|
{
|
|
using ADataType = DataType;
|
|
using AccDataType = float;
|
|
using BDataType = DataType;
|
|
|
|
ck_tile::index_t m = arg_parser.get_int("m");
|
|
ck_tile::index_t n = arg_parser.get_int("n");
|
|
int do_validation = arg_parser.get_int("v");
|
|
int warmup = arg_parser.get_int("warmup");
|
|
int repeat = arg_parser.get_int("repeat");
|
|
|
|
ck_tile::HostTensor<ADataType> a_host({m, n});
|
|
ck_tile::HostTensor<BDataType> b_host_ref({m});
|
|
ck_tile::HostTensor<BDataType> b_host_dev({m});
|
|
|
|
ck_tile::FillUniformDistribution<ADataType>{-5.f, 5.f}(a_host);
|
|
|
|
ck_tile::DeviceMem a_buf(a_host.get_element_space_size_in_bytes());
|
|
ck_tile::DeviceMem b_buf(b_host_dev.get_element_space_size_in_bytes());
|
|
|
|
a_buf.ToDevice(a_host.data());
|
|
|
|
using BlockWarps = ck_tile::sequence<4, 1>;
|
|
using BlockTile = ck_tile::sequence<128, 128>;
|
|
using WarpTile = ck_tile::sequence<32, 128>;
|
|
using ThreadTile = ck_tile::sequence<8, 8>;
|
|
|
|
constexpr ck_tile::index_t kBlockSize = 256;
|
|
constexpr ck_tile::index_t kBlockPerCu = 1;
|
|
ck_tile::index_t kGridSize = (m / BlockTile::at(ck_tile::number<0>{}));
|
|
std::cout << "grid size " << kGridSize << std::endl;
|
|
|
|
using Kernel = ck_tile::Reduce<ADataType,
|
|
AccDataType,
|
|
BDataType,
|
|
kBlockSize,
|
|
BlockWarps,
|
|
BlockTile,
|
|
WarpTile,
|
|
ThreadTile>;
|
|
|
|
float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
|
|
ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
|
|
Kernel{},
|
|
kGridSize,
|
|
kBlockSize,
|
|
0,
|
|
static_cast<ADataType*>(a_buf.GetDeviceBuffer()),
|
|
static_cast<BDataType*>(b_buf.GetDeviceBuffer()),
|
|
m,
|
|
n));
|
|
|
|
std::size_t num_btype = sizeof(ADataType) * m * n + sizeof(BDataType) * m;
|
|
|
|
float gb_per_sec = num_btype / 1.E6 / ave_time;
|
|
|
|
std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
|
|
|
|
bool pass = true;
|
|
|
|
if(do_validation)
|
|
{
|
|
// reference
|
|
ck_tile::reference_reduce<ADataType, AccDataType, BDataType>(a_host, b_host_ref);
|
|
b_buf.FromDevice(b_host_dev.mData.data());
|
|
pass = ck_tile::check_err(b_host_dev, b_host_ref);
|
|
|
|
std::cout << "valid:" << (pass ? "y" : "n") << std::flush << std::endl;
|
|
}
|
|
|
|
return pass;
|
|
}
|
|
|
|
int main(int argc, char* argv[])
|
|
{
|
|
auto [result, arg_parser] = create_args(argc, argv);
|
|
if(!result)
|
|
return -1;
|
|
|
|
const std::string data_type = arg_parser.get_str("prec");
|
|
|
|
if(data_type == "fp16")
|
|
{
|
|
return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
|
|
}
|
|
if(data_type == "bf16")
|
|
{
|
|
return run<ck_tile::bf16_t>(arg_parser) ? 0 : -2;
|
|
}
|
|
}
|