From aaa757e36352b1f5d16d21b32fe23c2c820bfbd2 Mon Sep 17 00:00:00 2001
From: AviralGoelAMD <aviral.goel@amd.com>
Date: Thu, 24 Apr 2025 14:24:01 +0000
Subject: [PATCH] fixed function and struct names

---
 .../CMakeLists.txt                            |  4 +--
 .../add_vector.cpp}                           | 25 ++++++++++---------
 .../add_vector.hpp}                           |  9 +++----
 .../reference_add_vector.hpp}                 |  2 +-
 example/ck_tile/99_toy_example/CMakeLists.txt |  2 +-
 5 files changed, 20 insertions(+), 22 deletions(-)
 rename example/ck_tile/99_toy_example/{00_vector_add => 00_add_vector}/CMakeLists.txt (91%)
 rename example/ck_tile/99_toy_example/{00_vector_add/vector_add.cpp => 00_add_vector/add_vector.cpp} (89%)
 rename example/ck_tile/99_toy_example/{00_vector_add/vector_add.hpp => 00_add_vector/add_vector.hpp} (95%)
 rename example/ck_tile/99_toy_example/{00_vector_add/reference_vector_add.hpp => 00_add_vector/reference_add_vector.hpp} (93%)
diff --git a/example/ck_tile/99_toy_example/00_vector_add/CMakeLists.txt b/example/ck_tile/99_toy_example/00_add_vector/CMakeLists.txt
similarity index 91%
rename from example/ck_tile/99_toy_example/00_vector_add/CMakeLists.txt
rename to example/ck_tile/99_toy_example/00_add_vector/CMakeLists.txt
index b6a7886d12..8c1665a1c4 100644
--- a/example/ck_tile/99_toy_example/00_vector_add/CMakeLists.txt
+++ b/example/ck_tile/99_toy_example/00_add_vector/CMakeLists.txt
@@ -1,9 +1,9 @@
-set(EXAMPLE_REDUCE "vector_add")
+set(EXAMPLE_REDUCE "add_vector")
 # not using add_example_executable() to add this target, since we don't want this to have
 # to be included in "make all/install/check"
 message("adding example ${EXAMPLE_REDUCE}")
 
-add_executable(${EXAMPLE_REDUCE} EXCLUDE_FROM_ALL vector_add.cpp)
+add_executable(${EXAMPLE_REDUCE} EXCLUDE_FROM_ALL add_vector.cpp)
 target_include_directories(${EXAMPLE_REDUCE} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
 set(EXAMPLE_REDUCE_COMPILE_OPTIONS)
 
diff --git a/example/ck_tile/99_toy_example/00_vector_add/vector_add.cpp b/example/ck_tile/99_toy_example/00_add_vector/add_vector.cpp
similarity index 89%
rename from example/ck_tile/99_toy_example/00_vector_add/vector_add.cpp
rename to example/ck_tile/99_toy_example/00_add_vector/add_vector.cpp
index b7d9effa76..7aea661b17 100644
--- a/example/ck_tile/99_toy_example/00_vector_add/vector_add.cpp
+++ b/example/ck_tile/99_toy_example/00_add_vector/add_vector.cpp
@@ -1,6 +1,6 @@
 #include "ck_tile/host.hpp"
-#include "reference_vector_add.hpp"
-#include "vector_add.hpp"
+#include "reference_add_vector.hpp"
+#include "add_vector.hpp"
 #include <cstring>
 
 // This example demonstrates how to use the ck_tile library to perform an elementwise vector
@@ -39,10 +39,10 @@ bool run(const ck_tile::ArgParser& arg_parser)
     int repeat         = arg_parser.get_int("repeat");
 
     ck_tile::HostTensor<XDataType> x_host_a(
-        {m}); // length input vector A, if given two arguments m, n the HostTensor will be created
+        {m}); // length input vector A, if given two arguments (m, n) the HostTensor will be created
               // with shape (m, n)
     ck_tile::HostTensor<XDataType> x_host_b(
-        {m}); // length input vector B, if given two arguments m, n the HostTensor will be created
+        {m}); // length input vector B, if given two arguments (m, n) the HostTensor will be created
               // with shape (m, n)
 
     ck_tile::HostTensor<YDataType> y_host_ref({m});
@@ -64,11 +64,12 @@ bool run(const ck_tile::ArgParser& arg_parser)
     x_buf_b.ToDevice(x_host_b.data());
 
     // Dividing the problem into blocktile, warptile, and vector
-    // The blocktile is the size of the tile that will be processed by a single block
-    // The warptile is the size of the tile that will be processed by a single warp
-    // The vector is the size of the tile that will be processed by a single thread
+    // The blocktile is the size of the tile that will be processed by a single thread block (also called work group)
+    // The warptile is the size of the tile that will be processed by a single warp (also called wavefront)
+    // The vector is the size of the tile that will be processed by a single thread (also called work item)
     // The problem is divided into blocks of size BlockTile, each block is further divided into
-    // warps of size WarpTile and each warp is further divided into threads of size Vector
+    // warps of size WarpTile and each warp is composed of 64 or 32 threads of size Vector
+    // each of the thread in a warp will process one vector worth elements of the data
     using BlockTile = ck_tile::sequence<8192>; // Size of the block tile (Entire problem is divided
                                                // into blocks of this size)
     using BlockWarps = ck_tile::sequence<8>; // How many concurrent warps are in a block (Each warp
@@ -91,7 +92,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
     std::cout << "block x-size = " << BlockTile::at(ck_tile::number<0>{}) << std::endl;
     std::cout << "grid size " << kGridSize << std::endl;
 
-    using Shape = ck_tile::MultiplyVector<BlockWarps, BlockTile, WarpTile, Vector>;
+    using Shape = ck_tile::AddVectorShape<BlockWarps, BlockTile, WarpTile, Vector>;
     std::cout << "Problem Shape:: M = " << m << std::endl;
     std::cout << "BlockTile: " << BlockTile::at(ck_tile::number<0>{}) << std::endl;
     std::cout << "Number of Blocks in Grid: " << m / BlockTile::at(ck_tile::number<0>{})
@@ -107,14 +108,14 @@ bool run(const ck_tile::ArgParser& arg_parser)
 
     // What is a Problem in CKTile?
     // A Problem defines the shape of the data, the precision of the data
-    using Problem = ck_tile::MultiplyVectorProblem<XDataType, ComputeDataType, YDataType, Shape>;
+    using Problem = ck_tile::AddVectorProblem<XDataType, ComputeDataType, YDataType, Shape>;
 
     // What is a Policy in CKTile?
     // A Policy defines how to map the data between threads and data in memory
 
     // The kernel is the function that will be executed on the device
     // It requires a Problem and Policy to be defined
-    using Kernel = ck_tile::MultiplyVectorKernel<Problem>;
+    using Kernel = ck_tile::AddVectorKernel<Problem>;
 
     // The kernel is launched with the following parameters:
     float ave_time = launch_kernel(
@@ -139,7 +140,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
 
     if(do_validation)
     {
-        ck_tile::reference_vector_add<XDataType, YDataType>(x_host_a, x_host_b, y_host_ref);
+        ck_tile::reference_add_vector<XDataType, YDataType>(x_host_a, x_host_b, y_host_ref);
         y_buf.FromDevice(y_host_dev.mData.data());
         pass = ck_tile::check_err(y_host_dev, y_host_ref);
 
diff --git a/example/ck_tile/99_toy_example/00_vector_add/vector_add.hpp b/example/ck_tile/99_toy_example/00_add_vector/add_vector.hpp
similarity index 95%
rename from example/ck_tile/99_toy_example/00_vector_add/vector_add.hpp
rename to example/ck_tile/99_toy_example/00_add_vector/add_vector.hpp
index b028a5a634..26e569c21e 100644
--- a/example/ck_tile/99_toy_example/00_vector_add/vector_add.hpp
+++ b/example/ck_tile/99_toy_example/00_add_vector/add_vector.hpp
@@ -14,7 +14,7 @@ namespace ck_tile {
 // and the number of times the warp tile is repeated in the block tile
 // and the block size
 template <typename BlockWarps, typename BlockTile, typename WarpTile, typename Vector>
-struct MultiplyVector
+struct AddVectorShape
 {
     static constexpr index_t Block_M = BlockTile::at(number<0>{});
 
@@ -35,7 +35,7 @@ struct MultiplyVector
 };
 
 template <typename XDataType_, typename ComputeDataType_, typename YDataType_, typename BlockShape_>
-struct MultiplyVectorProblem
+struct AddVectorProblem
 {
     using XDataType       = remove_cvref_t<XDataType_>;
     using ComputeDataType = remove_cvref_t<ComputeDataType_>;
@@ -65,7 +65,7 @@ struct AddDefaultPolicy
 };
 
 template <typename Problem_, typename Policy_ = AddDefaultPolicy>
-struct MultiplyVectorKernel
+struct AddVectorKernel
 {
     using Problem = ck_tile::remove_cvref_t<Problem_>;
     using Policy  = ck_tile::remove_cvref_t<Policy_>;
@@ -89,9 +89,6 @@ struct MultiplyVectorKernel
             number<S::Vector_M>{}); // raw pointer, shape of the tensor, stride of the tensor, and
                                     // lastGarunteedVectorLength
 
-        // lastGarunteedVectorLength --> intuitively, this is the number of elements in the last
-        // dimension of the tensor that are guaranteed to be fetched by same thread
-
         const auto x_m_n_b = make_naive_tensor_view<address_space_enum::global>(
             p_x_b, make_tuple(M), make_tuple(1), number<S::Vector_M>{});
 
diff --git a/example/ck_tile/99_toy_example/00_vector_add/reference_vector_add.hpp b/example/ck_tile/99_toy_example/00_add_vector/reference_add_vector.hpp
similarity index 93%
rename from example/ck_tile/99_toy_example/00_vector_add/reference_vector_add.hpp
rename to example/ck_tile/99_toy_example/00_add_vector/reference_add_vector.hpp
index 80acd663d2..4c034797dc 100644
--- a/example/ck_tile/99_toy_example/00_vector_add/reference_vector_add.hpp
+++ b/example/ck_tile/99_toy_example/00_add_vector/reference_add_vector.hpp
@@ -10,7 +10,7 @@
 namespace ck_tile {
 
 template <typename XDataType, typename YDataType>
-CK_TILE_HOST void reference_vector_add(const HostTensor<XDataType>& xa_m_n,
+CK_TILE_HOST void reference_add_vector(const HostTensor<XDataType>& xa_m_n,
                                        const HostTensor<XDataType>& xb_m_n,
                                        HostTensor<YDataType>& y_m_n)
 {
diff --git a/example/ck_tile/99_toy_example/CMakeLists.txt b/example/ck_tile/99_toy_example/CMakeLists.txt
index 49c3677fea..ba225e086c 100644
--- a/example/ck_tile/99_toy_example/CMakeLists.txt
+++ b/example/ck_tile/99_toy_example/CMakeLists.txt
@@ -2,7 +2,7 @@ include_directories(AFTER
   ${CMAKE_CURRENT_LIST_DIR}
 )
 
-add_subdirectory(00_vector_add)
+add_subdirectory(00_add_vector)
 add_subdirectory(01_add)
 add_subdirectory(02_gemm)
 add_subdirectory(03_flash_attention_fwd)