From 40058afae6d98cf740f9bb86432ca3beefce439d Mon Sep 17 00:00:00 2001 From: AviralGoelAMD Date: Tue, 15 Apr 2025 14:35:19 +0000 Subject: [PATCH] added explanation comments --- .../00_vector_add/vector_add.cpp | 93 ++++++++++++------- .../00_vector_add/vector_add.hpp | 58 +++++++----- 2 files changed, 95 insertions(+), 56 deletions(-) diff --git a/example/ck_tile/99_toy_example/00_vector_add/vector_add.cpp b/example/ck_tile/99_toy_example/00_vector_add/vector_add.cpp index ab51a5393b..9c80f731d8 100644 --- a/example/ck_tile/99_toy_example/00_vector_add/vector_add.cpp +++ b/example/ck_tile/99_toy_example/00_vector_add/vector_add.cpp @@ -3,6 +3,19 @@ #include "vector_add.hpp" #include + +// This example demonstrates how to use the ck_tile library to perform an elementwise vector addition +// using a custom kernel. The kernel is defined in the vector_add.hpp file, and the reference implementation +// is provided in the reference_vector_add.hpp file. + + + +// parse command line arguments +// -m: size of the vectors +// -v: validation flag (1 for validation, 0 for no validation) +// -prec: precision of the data type (fp16, fp32, int8, int32) +// -warmup: number of warmup iterations (number of kernel launches before measuring performance) +// -repeat: number of repeat iterations (number of kernel launches to measure performance) auto create_args(int argc, char* argv[]) { ck_tile::ArgParser arg_parser; @@ -19,45 +32,48 @@ auto create_args(int argc, char* argv[]) template bool run(const ck_tile::ArgParser& arg_parser) { - using XDataType = DataType; - using ComputeDataType = float; - using YDataType = DataType; + using XDataType = DataType; // input data type + using ComputeDataType = float; // compute data type + using YDataType = DataType; // output data type - ck_tile::index_t m = arg_parser.get_int("m"); - int do_validation = arg_parser.get_int("v"); + ck_tile::index_t m = arg_parser.get_int("m"); // size of the vectors + int do_validation = arg_parser.get_int("v"); // do we verify the result on cpu int warmup = arg_parser.get_int("warmup"); int repeat = arg_parser.get_int("repeat"); - ck_tile::HostTensor x_host_a({m}); - ck_tile::HostTensor x_host_b({m}); + ck_tile::HostTensor x_host_a({m}); // length input vector A, if given two arguments m, n the HostTensor will be created with shape (m, n) + ck_tile::HostTensor x_host_b({m}); // length input vector B, if given two arguments m, n the HostTensor will be created with shape (m, n) ck_tile::HostTensor y_host_ref({m}); ck_tile::HostTensor y_host_dev({m}); - ck_tile::FillUniformDistribution{-5.f, 5.f}(x_host_a); - ck_tile::FillUniformDistribution{-5.f, 5.f}(x_host_b); + ck_tile::FillUniformDistribution{-5.f, 5.f}(x_host_a); // fill the input vector A with random values + ck_tile::FillUniformDistribution{-5.f, 5.f}(x_host_b); - ck_tile::DeviceMem x_buf_a(x_host_a.get_element_space_size_in_bytes()); + ck_tile::DeviceMem x_buf_a(x_host_a.get_element_space_size_in_bytes()); // allocate device memory for input vector A (this a wrapper over hipMalloc) ck_tile::DeviceMem x_buf_b(x_host_b.get_element_space_size_in_bytes()); ck_tile::DeviceMem y_buf(y_host_dev.get_element_space_size_in_bytes()); - x_buf_a.ToDevice(x_host_a.data()); + x_buf_a.ToDevice(x_host_a.data()); // copy the input vector A to device memory, this is a wrapper over hipMemcpy x_buf_b.ToDevice(x_host_b.data()); - // using BlockTile = ck_tile::sequence<8192>; - // using BlockWarps = ck_tile::sequence<4>; - // using WarpTile = ck_tile::sequence<512>; // 8 * 64 = 512 - // using Vector = ck_tile::sequence<8>; // 8 * 16 = 128 bytes + // Dividing the problem into blocktile, warptile, and vector + // The blocktile is the size of the tile that will be processed by a single block + // The warptile is the size of the tile that will be processed by a single warp + // The vector is the size of the tile that will be processed by a single thread + // The problem is divided into blocks of size BlockTile, each block is further divided into warps of size WarpTile + // and each warp is further divided into threads of size Vector + using BlockTile = ck_tile::sequence<8192>; // Size of the block tile (Entire problem is divided into blocks of this size) + using BlockWarps = ck_tile::sequence<8>; // How many concurrent warps are in a block (Each warp will cover some part of blockTile) + using WarpTile = ck_tile::sequence<64>; // How many elements are covered by a warp + using Vector = ck_tile::sequence<1>; // How many elements are covered by a thread (Each thread will cover some part of WarpTile) - // constexpr ck_tile::index_t kBlockSize = 256; - // constexpr ck_tile::index_t kBlockPerCu = 1; - - - using BlockTile = ck_tile::sequence<8192>; - using BlockWarps = ck_tile::sequence<8>; - using WarpTile = ck_tile::sequence<64>; - using Vector = ck_tile::sequence<1>; + // Interpretation of above configurations + // Each thread will cover 1 element (Vector) + // Each WarpTile will cover 64 elements (WarpTile) --> since 64 threads in a warp + // if we have 8 warps in a block (BlockWarps) then we have 8 * 64 = 512 threads in a block + // if 8 warps are not enough to cover the entire blockTile then each of the 8 concurrent warps will iterate over the blockTile several times constexpr ck_tile::index_t kBlockSize = 512; constexpr ck_tile::index_t kBlockPerCu = 1; @@ -76,24 +92,33 @@ bool run(const ck_tile::ArgParser& arg_parser) std::cout << "BlockWarps: " << BlockWarps::at(ck_tile::number<0>{}) << std::endl; std::cout << "WarpTile: " << WarpTile::at(ck_tile::number<0>{}) << std::endl; std::cout << "Vector: " << Vector::at(ck_tile::number<0>{}) << std::endl; - std::cout << "Repeat: " << Shape::Repeat_M << std::endl; + std::cout << "Repeat: " << Shape::Repeat_M << std::endl; // number of times a warp will iterate over the blockTile, covering different parts of the blockTile std::cout << "Threads per Block: " << kBlockSize << std::endl; std::cout << "ThreadBlocks per CU: " << kBlockPerCu << std::endl; + + // What is a Problem in CKTile? + // A Problem defines the shape of the data, the precision of the data using Problem = ck_tile::MultiplyVectorProblem; + + // What is a Policy in CKTile? + // A Policy defines how to map the data between threads and data in memory + // The kernel is the function that will be executed on the device + // It requires a Problem and Policy to be defined using Kernel = ck_tile::MultiplyVectorKernel; - float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, warmup, repeat}, - ck_tile::make_kernel( - Kernel{}, - kGridSize, - kBlockSize, - 0, - static_cast(x_buf_a.GetDeviceBuffer()), - static_cast(x_buf_b.GetDeviceBuffer()), - static_cast(y_buf.GetDeviceBuffer()), - m)); + // The kernel is launched with the following parameters: + float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, warmup, repeat}, // wrapper over hipStreamCreate + ck_tile::make_kernel( // numOfThreadsPerBlock, numOfBlocksPerCU + Kernel{}, // kernel + kGridSize, // number of blocks in the grid + kBlockSize, // number of threads in a block + 0, // shared memory size + static_cast(x_buf_a.GetDeviceBuffer()), // input vector A + static_cast(x_buf_b.GetDeviceBuffer()), // input vector B + static_cast(y_buf.GetDeviceBuffer()), // output vector + m)); std::size_t num_btype = sizeof(XDataType) * m + sizeof(YDataType) * m; diff --git a/example/ck_tile/99_toy_example/00_vector_add/vector_add.hpp b/example/ck_tile/99_toy_example/00_vector_add/vector_add.hpp index 12c97b41ae..5500a8d672 100644 --- a/example/ck_tile/99_toy_example/00_vector_add/vector_add.hpp +++ b/example/ck_tile/99_toy_example/00_vector_add/vector_add.hpp @@ -8,29 +8,32 @@ namespace ck_tile { -template - typename BlockTile, // block size, seq - typename WarpTile, // warp size, seq - typename Vector> // contiguous pixels(vector size) along seq + +// struct that holds the tile size of the block, warp, and vector +// and the number of warps per block +// and the number of threads per warp +// and the number of times the warp tile is repeated in the block tile +// and the block size +template struct MultiplyVector { - static constexpr index_t Block_M = BlockTile::at(number<0>{}); - //static constexpr index_t Block_N = BlockTile::at(number<1>{}); + static constexpr index_t Block_M = BlockTile::at(number<0>{}); static constexpr index_t Warp_M = WarpTile::at(number<0>{}); - //static constexpr index_t Warp_N = WarpTile::at(number<1>{}); static constexpr index_t Vector_M = Vector::at(number<0>{}); - //static constexpr index_t Vector_N = Vector::at(number<1>{}); - static constexpr index_t WarpPerBlock_M = BlockWarps::at(number<0>{}); - //static constexpr index_t WarpPerBlock_N = BlockWarps::at(number<1>{}); + static constexpr index_t WarpPerBlock_M = BlockWarps::at(number<0>{}); + static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M; - //static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N; + - static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M); - //static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N); + static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M); // Number of times the warp tile is repeated in the block tile + static constexpr index_t BlockSize = warpSize * reduce_on_sequence(BlockWarps{}, multiplies{}, number<1>{}); @@ -48,6 +51,8 @@ struct MultiplyVectorProblem using BlockShape = remove_cvref_t; }; + +// data mapping beween threads and memory struct AddDefaultPolicy { template @@ -56,12 +61,13 @@ struct AddDefaultPolicy using S = typename Problem::BlockShape; return make_static_tile_distribution( tile_distribution_encoding< - sequence<>, - tuple>, - tuple, sequence<1>>, - tuple, sequence<2>>, - sequence<1, 1>, - sequence<0, 3>>{}); + sequence<>, // Replicate + tuple>, // Hierarchical + tuple, sequence<1>>, // Parallel + tuple, sequence<2>>, // Parallel + sequence<1, 1>, // Yield + sequence<0, 3>>{} // Yield + ); } }; @@ -75,12 +81,17 @@ struct MultiplyVectorKernel using ComputeDataType = ck_tile::remove_cvref_t; using YDataType = ck_tile::remove_cvref_t; + + // body of the kernel CK_TILE_DEVICE void operator()(const XDataType* p_x_a, const XDataType* p_x_b, YDataType* p_y, index_t M) const { using S = typename Problem::BlockShape; + // create tensor view for the input and output data, this defines how the data is laid out in memory const auto x_m_n_a = make_naive_tensor_view( - p_x_a, make_tuple(M), make_tuple(1), number{}); + p_x_a, make_tuple(M), make_tuple(1), number{}); // raw pointer, shape of the tensor, stride of the tensor, and lastGarunteedVectorLength + + // lastGarunteedVectorLength --> intuitively, this is the number of elements in the last dimension of the tensor that are guaranteed to be fetched by same thread const auto x_m_n_b = make_naive_tensor_view( p_x_b, make_tuple(M), make_tuple(1), number{}); @@ -88,8 +99,11 @@ struct MultiplyVectorKernel const auto y_m_n = make_naive_tensor_view( p_y, make_tuple(M), make_tuple(1), number{}); + + // origin of the block tile const auto iM = get_block_id() * S::Block_M; + // creating tile windows for the input and output data auto x_window_a = make_tile_window(x_m_n_a, make_tuple(number{}), {iM}, @@ -112,9 +126,9 @@ struct MultiplyVectorKernel - // Process the vector multiplication + // Process the vector add constexpr auto spans = decltype(xa)::get_distributed_spans(); // shape of the tile - sweep_tile_span(spans[number<0>{}], [&](auto idx) { // iterate over the tile // idx+=4 + sweep_tile_span(spans[number<0>{}], [&](auto idx) { // iterate over the tile const auto tile_idx = make_tuple(idx); const auto a_val = type_convert(xa[tile_idx]); const auto b_val = type_convert(xb[tile_idx]);