diff --git a/example/ck_tile/99_toy_example/01_add/CMakeLists.txt b/example/ck_tile/99_toy_example/01_add/CMakeLists.txt
new file mode 100644
index 0000000000..05bfd67253
--- /dev/null
+++ b/example/ck_tile/99_toy_example/01_add/CMakeLists.txt
@@ -0,0 +1,19 @@
+set(EXAMPLE_ADD "add")
+# not using add_example_executable() to add this target, since we don't want this to have
+# to be included in "make all/install/check"
+message("adding example ${EXAMPLE_ADD}")
+
+add_executable(${EXAMPLE_ADD} EXCLUDE_FROM_ALL add.cpp)
+target_include_directories(${EXAMPLE_ADD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
+set(EXAMPLE_ADD_COMPILE_OPTIONS)
+
+# NOTE: we turn off undefined-func-template so the source compiles without explicitly declaring function specializations
+list(APPEND EXAMPLE_ADD_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal)
+
+target_compile_options(${EXAMPLE_ADD} PRIVATE ${EXAMPLE_ADD_COMPILE_OPTIONS})
+
+# TODO: we have to turn off this global prop, otherwise the progress bar generated
+# by cmake will print too many files, execvp: /bin/sh: Argument list too long
+# however, this property is global and may affect other targets
+# TODO: consider generating a makefile ourselves instead
+set_property(GLOBAL PROPERTY RULE_MESSAGES OFF)
diff --git a/example/ck_tile/99_toy_example/01_add/add.cpp b/example/ck_tile/99_toy_example/01_add/add.cpp
new file mode 100644
index 0000000000..3646c83cff
--- /dev/null
+++ b/example/ck_tile/99_toy_example/01_add/add.cpp
@@ -0,0 +1,128 @@
+#include "ck_tile/host.hpp"
+#include "reference_add.hpp"
+#include "add.hpp"
+#include <iostream>
+#include <string>
+#include <tuple>
+
+// Registers the example's command-line options and parses argv.
+// Returns a (parse succeeded, parser) tuple.
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("m", "10240", "m dimension")
+        .insert("n", "4096", "n dimension")
+        .insert("v", "1", "cpu validation or not")
+        .insert("prec", "fp16", "precision")
+        .insert("warmup", "5", "cold iter")
+        .insert("repeat", "20", "hot iter");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+// Runs y = x_a + x_b on the device for an m x n problem with element type DataType,
+// prints the measured time and bandwidth and, when "v" is non-zero, compares the device
+// result with the host reference. Returns false only if validation ran and failed.
+template <typename DataType>
+bool run(const ck_tile::ArgParser& arg_parser)
+{
+    using XDataType       = DataType;
+    using ComputeDataType = float;
+    using YDataType       = DataType;
+
+    ck_tile::index_t m = arg_parser.get_int("m");
+    ck_tile::index_t n = arg_parser.get_int("n");
+    int do_validation  = arg_parser.get_int("v");
+    int warmup         = arg_parser.get_int("warmup");
+    int repeat         = arg_parser.get_int("repeat");
+
+    ck_tile::HostTensor<XDataType> x_host_a({m, n});
+    ck_tile::HostTensor<XDataType> x_host_b({m, n});
+
+    ck_tile::HostTensor<YDataType> y_host_ref({m, n});
+    ck_tile::HostTensor<YDataType> y_host_dev({m, n});
+
+    ck_tile::FillUniformDistribution<XDataType>{-5.f, 5.f}(x_host_a);
+    ck_tile::FillUniformDistribution<XDataType>{-5.f, 5.f}(x_host_b);
+
+    ck_tile::DeviceMem x_buf_a(x_host_a.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem x_buf_b(x_host_b.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem y_buf(y_host_dev.get_element_space_size_in_bytes());
+
+    x_buf_a.ToDevice(x_host_a.data());
+    x_buf_b.ToDevice(x_host_b.data());
+
+    // 27xx -> 32xx, 1D block to 2D block
+    using BlockWarps = ck_tile::sequence<1, 8>;
+    using BlockTile  = ck_tile::sequence<1, 2048>;
+    using WarpTile   = ck_tile::sequence<1, 256>;
+    using Vector     = ck_tile::sequence<1, 4>;
+
+    constexpr ck_tile::index_t kBlockSize  = 512;
+    constexpr ck_tile::index_t kBlockPerCu = 1;
+    // One workgroup per BlockTile-M rows; the kernel itself loops over the N dimension.
+    ck_tile::index_t kGridSize = (m / BlockTile::at(ck_tile::number<0>{}));
+    std::cout << "block x-size = " << BlockTile::at(ck_tile::number<0>{}) << std::endl;
+    std::cout << "grid size " << kGridSize << std::endl;
+
+    using Shape   = ck_tile::AddShape<BlockWarps, BlockTile, WarpTile, Vector>;
+    using Problem = ck_tile::AddProblem<XDataType, ComputeDataType, YDataType, Shape>;
+
+    using Kernel = ck_tile::Add<Problem>;
+
+    float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
+                                   ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
+                                       Kernel{},
+                                       kGridSize,
+                                       kBlockSize,
+                                       0,
+                                       static_cast<XDataType*>(x_buf_a.GetDeviceBuffer()),
+                                       static_cast<XDataType*>(x_buf_b.GetDeviceBuffer()),
+                                       static_cast<YDataType*>(y_buf.GetDeviceBuffer()),
+                                       m,
+                                       n));
+
+    // Two input tensors are read and one output tensor is written.
+    std::size_t num_btype = 2 * sizeof(XDataType) * m * n + sizeof(YDataType) * m * n;
+
+    // bytes / 1e6 / milliseconds == GB/s
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
+
+    bool pass = true;
+
+    if(do_validation)
+    {
+        ck_tile::reference_add<XDataType, ComputeDataType, YDataType>(
+            x_host_a, x_host_b, y_host_ref);
+        y_buf.FromDevice(y_host_dev.mData.data());
+        pass = ck_tile::check_err(y_host_dev, y_host_ref);
+
+        std::cout << "valid:" << (pass ? "y" : "n") << std::endl;
+    }
+
+    return pass;
+}
+
+// Entry point. Exit codes: 0 success, -1 bad command line, -2 validation failed,
+// -3 unsupported "prec" value.
+int main(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    const std::string data_type = arg_parser.get_str("prec");
+
+    if(data_type == "fp16")
+    {
+        return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
+    }
+
+    // No other precision is instantiated. Fail explicitly rather than falling off the end
+    // of main(), which would exit with 0 without having run anything.
+    std::cerr << "unsupported prec: " << data_type << std::endl;
+    return -3;
+}
diff --git a/example/ck_tile/99_toy_example/01_add/add.hpp b/example/ck_tile/99_toy_example/01_add/add.hpp
new file mode 100644
index 0000000000..71d2aca924
--- /dev/null
+++ b/example/ck_tile/99_toy_example/01_add/add.hpp
@@ -0,0 +1,156 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/common.hpp"
+
+namespace ck_tile {
+
+// Compile-time tiling description for the Add kernel. Each template parameter is a
+// two-element sequence indexed as <M, N>.
+template <typename BlockWarps, // num warps along seq<M, N>
+          typename BlockTile,  // block size, seq<M, N>
+          typename WarpTile,   // warp size, seq<M, N>
+          typename Vector>     // contiguous pixels(vector size) along seq<M, N>
+struct AddShape
+{
+    static constexpr index_t Block_M = BlockTile::at(number<0>{});
+    static constexpr index_t Block_N = BlockTile::at(number<1>{});
+
+    static constexpr index_t Warp_M = WarpTile::at(number<0>{});
+    static constexpr index_t Warp_N = WarpTile::at(number<1>{});
+
+    static constexpr index_t Vector_M = Vector::at(number<0>{});
+    static constexpr index_t Vector_N = Vector::at(number<1>{});
+
+    static constexpr index_t WarpPerBlock_M = BlockWarps::at(number<0>{});
+    static constexpr index_t WarpPerBlock_N = BlockWarps::at(number<1>{});
+
+    static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M;
+    static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N;
+
+    // Block tile extent divided by the extent all warps of the block cover in one pass.
+    static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M);
+    static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N);
+
+    // Threads per workgroup: warpSize times the product of the BlockWarps entries.
+    static constexpr index_t BlockSize =
+        warpSize * reduce_on_sequence(BlockWarps{}, multiplies{}, number<1>{});
+};
+
+// Bundles the element types and the tile shape consumed by the Add kernel.
+template <typename XDataType_,
+          typename ComputeDataType_,
+          typename YDataType_,
+          typename BlockShape_>
+struct AddProblem
+{
+    using XDataType       = remove_cvref_t<XDataType_>;
+    using ComputeDataType = remove_cvref_t<ComputeDataType_>;
+    using YDataType       = remove_cvref_t<YDataType_>;
+    using BlockShape      = remove_cvref_t<BlockShape_>;
+};
+
+struct AddDefaultPolicy
+{
+    // Tile distribution shared by the two input windows and the output window.
+    // NOTE(review): the per-dimension split below is (Repeat, WarpPerBlock, ThreadPerWarp,
+    // Vector); confirm it against the index sequences that follow.
+    template <typename Problem>
+    CK_TILE_DEVICE static constexpr auto MakeXBlockTileDistribution()
+    {
+        using S = typename Problem::BlockShape;
+        return make_static_tile_distribution(
+            tile_distribution_encoding<
+                sequence<>,
+                tuple<sequence<S::Repeat_M, S::WarpPerBlock_M, S::ThreadPerWarp_M, S::Vector_M>,
+                      sequence<S::Repeat_N, S::WarpPerBlock_N, S::ThreadPerWarp_N, S::Vector_N>>,
+                tuple<sequence<1, 2>, sequence<1, 2>>,
+                tuple<sequence<1, 1>, sequence<2, 2>>,
+                sequence<1, 1, 2, 2>,
+                sequence<0, 3, 0, 3>>{});
+    }
+};
+
+// Element-wise add kernel parameterized by an AddProblem and a tile-distribution policy.
+template <typename Problem_, typename Policy_ = AddDefaultPolicy>
+struct Add
+{
+    using Problem = ck_tile::remove_cvref_t<Problem_>;
+    using Policy  = ck_tile::remove_cvref_t<Policy_>;
+
+    using XDataType       = ck_tile::remove_cvref_t<typename Problem::XDataType>;
+    using ComputeDataType = ck_tile::remove_cvref_t<typename Problem::ComputeDataType>;
+    using YDataType       = ck_tile::remove_cvref_t<typename Problem::YDataType>;
+
+    // Computes p_y = p_x_a + p_x_b for M x N tensors addressed with strides (N, 1).
+    // Each workgroup starts at row get_block_id() * Block_M and steps through the N
+    // dimension Block_N columns at a time.
+    CK_TILE_DEVICE void operator()(const XDataType* p_x_a,
+                                   const XDataType* p_x_b,
+                                   YDataType* p_y,
+                                   index_t M,
+                                   index_t N) const
+    {
+        using S = typename Problem::BlockShape;
+
+        const auto x_m_n_a = make_naive_tensor_view<address_space_enum::global>(
+            p_x_a, make_tuple(M, N), make_tuple(N, 1), number<S::Vector_N>{}, number<1>{});
+
+        const auto x_m_n_b = make_naive_tensor_view<address_space_enum::global>(
+            p_x_b, make_tuple(M, N), make_tuple(N, 1), number<S::Vector_N>{}, number<1>{});
+
+        const auto y_m_n = make_naive_tensor_view<address_space_enum::global>(
+            p_y, make_tuple(M, N), make_tuple(N, 1), number<S::Vector_N>{}, number<1>{});
+
+        // First row handled by this workgroup.
+        const auto iM = get_block_id() * S::Block_M;
+
+        auto x_window_a = make_tile_window(x_m_n_a,
+                                           make_tuple(number<S::Block_M>{}, number<S::Block_N>{}),
+                                           {iM, 0},
+                                           Policy::template MakeXBlockTileDistribution<Problem>());
+
+        auto x_window_b = make_tile_window(x_m_n_b,
+                                           make_tuple(number<S::Block_M>{}, number<S::Block_N>{}),
+                                           {iM, 0},
+                                           Policy::template MakeXBlockTileDistribution<Problem>());
+
+        auto y_window = make_tile_window(y_m_n,
+                                         make_tuple(number<S::Block_M>{}, number<S::Block_N>{}),
+                                         {iM, 0},
+                                         Policy::template MakeXBlockTileDistribution<Problem>());
+
+        index_t num_n_tile_iteration =
+            __builtin_amdgcn_readfirstlane(integer_divide_ceil(N, S::Block_N));
+
+        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        {
+            const auto xa = load_tile(x_window_a);
+            const auto xb = load_tile(x_window_b);
+            // NOTE(review): every swept element of y_compute is overwritten below, so this
+            // load appears to serve only to obtain a tile of the right type and distribution.
+            auto y_compute = load_tile(y_window);
+
+            constexpr auto spans = decltype(xa)::get_distributed_spans();
+            sweep_tile_span(spans[number<0>{}], [&](auto idx0) {
+                sweep_tile_span(spans[number<1>{}], [&](auto idx1) {
+                    constexpr auto i_j_idx = ck_tile::make_tuple(idx0, idx1);
+                    const auto x = ck_tile::type_convert<ComputeDataType>(xa[i_j_idx]);
+                    const auto y = ck_tile::type_convert<ComputeDataType>(xb[i_j_idx]);
+                    y_compute(i_j_idx) = x + y;
+                });
+            });
+
+            store_tile(y_window, cast_tile<YDataType>(y_compute));
+            // Advance all three windows to the next Block_N columns.
+            move_tile_window(x_window_a, {0, S::Block_N});
+            move_tile_window(x_window_b, {0, S::Block_N});
+            move_tile_window(y_window, {0, S::Block_N});
+        }
+    }
+};
+
+} // namespace ck_tile
diff --git a/example/ck_tile/99_toy_example/01_add/reference_add.hpp b/example/ck_tile/99_toy_example/01_add/reference_add.hpp
new file mode 100644
index 0000000000..26a72286da
--- /dev/null
+++ b/example/ck_tile/99_toy_example/01_add/reference_add.hpp
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/host_tensor.hpp"
+#include <thread>
+
+namespace ck_tile {
+
+// Host reference for the add example: y_m_n(m, n) = xa_m_n(m, n) + xb_m_n(m, n).
+// The row count is taken from y_m_n and the column count from xa_m_n, so the caller is
+// expected to pass three tensors of the same 2D shape.
+// NOTE(review): operands are assumed to be converted to ComputeDataType before the add - confirm.
+template <typename XDataType, typename ComputeDataType, typename YDataType>
+CK_TILE_HOST void
+reference_add(const HostTensor<XDataType>& xa_m_n, const HostTensor<XDataType>& xb_m_n, HostTensor<YDataType>& y_m_n)
+{
+    auto f = [&](auto m) {
+        const int N = xa_m_n.mDesc.get_lengths()[1];
+
+        for(int n = 0; n < N; ++n)
+        {
+            y_m_n(m, n) = ck_tile::type_convert<ComputeDataType>(xa_m_n(m, n)) + ck_tile::type_convert<ComputeDataType>(xb_m_n(m, n));
+        }
+    };
+
+    make_ParallelTensorFunctor(f, y_m_n.mDesc.get_lengths()[0])(std::thread::hardware_concurrency());
+}
+
+} // namespace ck_tile
diff --git a/example/ck_tile/99_toy_example/CMakeLists.txt b/example/ck_tile/99_toy_example/CMakeLists.txt
new file mode 100644
index 0000000000..80024d45e8
--- /dev/null
+++ b/example/ck_tile/99_toy_example/CMakeLists.txt
@@ -0,0 +1,5 @@
+include_directories(AFTER
+    ${CMAKE_CURRENT_LIST_DIR}
+)
+
+add_subdirectory(01_add)
diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt
index 7f4ba2ed35..8a66732860 100644
--- a/example/ck_tile/CMakeLists.txt
+++ b/example/ck_tile/CMakeLists.txt
@@ -18,3 +18,4 @@ add_subdirectory(15_fused_moe)
 add_subdirectory(16_batched_gemm)
 add_subdirectory(17_grouped_gemm)
 add_subdirectory(35_batched_transpose)
+add_subdirectory(99_toy_example)