diff --git a/example/ck_tile/99_toy_example/01_add/CMakeLists.txt b/example/ck_tile/99_toy_example/01_add/CMakeLists.txt
new file mode 100644
index 0000000000..05bfd67253
--- /dev/null
+++ b/example/ck_tile/99_toy_example/01_add/CMakeLists.txt
@@ -0,0 +1,19 @@
+# Toy "add" example. The target is declared with a plain add_executable() and
+# EXCLUDE_FROM_ALL (instead of add_example_executable()) so that it is not
+# pulled into "make all/install/check"; it has to be built explicitly.
+set(EXAMPLE_ADD "add")
+message("adding example ${EXAMPLE_ADD}")
+
+add_executable(${EXAMPLE_ADD} EXCLUDE_FROM_ALL add.cpp)
+target_include_directories(${EXAMPLE_ADD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
+
+# NOTE: -Wno-undefined-func-template lets the source compile without explicitly
+# declared function template specializations.
+target_compile_options(${EXAMPLE_ADD} PRIVATE -Wno-undefined-func-template -Wno-float-equal)
+
+# TODO: RULE_MESSAGES is switched off because otherwise the progress output
+# generated by cmake prints too many files and fails with
+# "execvp: /bin/sh: Argument list too long".
+# Caution: this is a GLOBAL property, so it may affect every target of the build.
+# TODO: consider generating the makefile ourselves instead.
+set_property(GLOBAL PROPERTY RULE_MESSAGES OFF)
diff --git a/example/ck_tile/99_toy_example/01_add/add.cpp b/example/ck_tile/99_toy_example/01_add/add.cpp
new file mode 100644
index 0000000000..3646c83cff
--- /dev/null
+++ b/example/ck_tile/99_toy_example/01_add/add.cpp
@@ -0,0 +1,112 @@
+#include "ck_tile/host.hpp"
+#include "reference_add.hpp"
+#include "add.hpp"
+#include <cstring>
+
+// Declares the command-line options of this example and parses argv; returns (ok, parser).
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser parser;
+    parser.insert("m", "10240", "m dimension");
+    parser.insert("n", "4096", "n dimension");
+    parser.insert("v", "1", "cpu validation or not");
+    parser.insert("prec", "fp16", "precision");
+    parser.insert("warmup", "5", "cold iter");
+    parser.insert("repeat", "20", "hot iter");
+    const bool parsed = parser.parse(argc, argv);
+    return std::make_tuple(parsed, parser);
+}
+
+// Runs one elementwise add (y = a + b) of two m x n tensors on the device, prints the measured
+// time and bandwidth and, when "v" is non-zero, validates the result against the host reference.
+// Returns true when the validation passed or was skipped.
+template <typename DataType>
+bool run(const ck_tile::ArgParser& arg_parser)
+{
+    using XDataType       = DataType;
+    using ComputeDataType = float;
+    using YDataType       = DataType;
+
+    ck_tile::index_t m = arg_parser.get_int("m");
+    ck_tile::index_t n = arg_parser.get_int("n");
+    int do_validation  = arg_parser.get_int("v");
+    int warmup         = arg_parser.get_int("warmup");
+    int repeat         = arg_parser.get_int("repeat");
+
+    ck_tile::HostTensor<XDataType> x_host_a({m, n});
+    ck_tile::HostTensor<XDataType> x_host_b({m, n});
+    ck_tile::HostTensor<YDataType> y_host_ref({m, n});
+    ck_tile::HostTensor<YDataType> y_host_dev({m, n});
+
+    ck_tile::FillUniformDistribution<XDataType>{-5.f, 5.f}(x_host_a);
+    ck_tile::FillUniformDistribution<XDataType>{-5.f, 5.f}(x_host_b);
+
+    ck_tile::DeviceMem x_buf_a(x_host_a.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem x_buf_b(x_host_b.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem y_buf(y_host_dev.get_element_space_size_in_bytes());
+
+    x_buf_a.ToDevice(x_host_a.data());
+    x_buf_b.ToDevice(x_host_b.data());
+
+    // Tile configuration, every sequence is <M, N>: a block covers 1 x 2048 elements using 8 warps
+    // along N; a warp covers 1 x 256 elements and the vector size along N is 4.
+    // (original perf note: 27xx -> 32xx, 1D block to 2D block)
+    using BlockWarps = ck_tile::sequence<1, 8>;
+    using BlockTile  = ck_tile::sequence<1, 2048>;
+    using WarpTile   = ck_tile::sequence<1, 256>;
+    using Vector     = ck_tile::sequence<1, 4>;
+
+    constexpr ck_tile::index_t kBlockSize  = 512;
+    constexpr ck_tile::index_t kBlockPerCu = 1;
+    ck_tile::index_t kGridSize             = (m / BlockTile::at(ck_tile::number<0>{}));
+    std::cout << "block x-size = " << BlockTile::at(ck_tile::number<0>{}) << std::endl;
+    std::cout << "grid size " << kGridSize << std::endl;
+
+    using Shape   = ck_tile::AddShape<BlockWarps, BlockTile, WarpTile, Vector>;
+    using Problem = ck_tile::AddProblem<XDataType, ComputeDataType, YDataType, Shape>;
+    using Kernel  = ck_tile::Add<Problem>;
+
+    float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
+                                   ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
+                                       Kernel{},
+                                       kGridSize,
+                                       kBlockSize,
+                                       0,
+                                       static_cast<XDataType*>(x_buf_a.GetDeviceBuffer()),
+                                       static_cast<XDataType*>(x_buf_b.GetDeviceBuffer()),
+                                       static_cast<YDataType*>(y_buf.GetDeviceBuffer()),
+                                       m,
+                                       n));
+
+    // Bytes moved per launch: BOTH inputs are read once and the output is written once.
+    std::size_t num_btype = 2 * sizeof(XDataType) * m * n + sizeof(YDataType) * m * n;
+    // bytes / 1e6 / milliseconds == GB/s
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
+
+    bool pass = true;
+    if(do_validation)
+    {
+        ck_tile::reference_add<XDataType, YDataType>(x_host_a, x_host_b, y_host_ref);
+        y_buf.FromDevice(y_host_dev.data());
+        pass = ck_tile::check_err(y_host_dev, y_host_ref);
+        std::cout << "valid:" << (pass ? "y" : "n") << std::endl;
+    }
+
+    return pass;
+}
+
+// Returns 0 on success, -1 on bad arguments, -2 on failed validation, -3 on unsupported "prec".
+int main(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    const std::string data_type = arg_parser.get_str("prec");
+    if(data_type == "fp16")
+        return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
+    // Previously an unknown precision fell off the end of main and reported success.
+    std::cerr << "unsupported prec: " << data_type << " (only fp16 is implemented)" << std::endl;
+    return -3;
+}
diff --git a/example/ck_tile/99_toy_example/01_add/add.hpp b/example/ck_tile/99_toy_example/01_add/add.hpp
new file mode 100644
index 0000000000..71d2aca924
--- /dev/null
+++ b/example/ck_tile/99_toy_example/01_add/add.hpp
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/common.hpp"
+
+namespace ck_tile {
+
+template <typename BlockWarps, // number of warps per block along seq<M, N>
+          typename BlockTile,  // elements covered by one block along seq<M, N>
+          typename WarpTile,   // elements covered by one warp along seq<M, N>
+          typename Vector>     // contiguous elements (vector size) along seq<M, N>
+struct AddShape
+{
+    static constexpr index_t Block_M = BlockTile::at(number<0>{});
+    static constexpr index_t Block_N = BlockTile::at(number<1>{});
+    static constexpr index_t Warp_M = WarpTile::at(number<0>{});
+    static constexpr index_t Warp_N = WarpTile::at(number<1>{});
+    static constexpr index_t Vector_M = Vector::at(number<0>{});
+    static constexpr index_t Vector_N = Vector::at(number<1>{});
+    static constexpr index_t WarpPerBlock_M = BlockWarps::at(number<0>{});
+    static constexpr index_t WarpPerBlock_N = BlockWarps::at(number<1>{});
+
+    // The derived factors use integer division: reject shapes that would silently truncate.
+    static_assert(Warp_M % Vector_M == 0 && Warp_N % Vector_N == 0, "WarpTile % Vector != 0");
+    static_assert(Block_M % (WarpPerBlock_M * Warp_M) == 0, "Block_M % (warps * Warp_M) != 0");
+    static_assert(Block_N % (WarpPerBlock_N * Warp_N) == 0, "Block_N % (warps * Warp_N) != 0");
+    static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M;
+    static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N;
+    static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M);
+    static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N);
+    // warpSize times the BlockWarps entries folded with multiplies{} starting from 1
+    static constexpr index_t BlockSize =
+        warpSize * reduce_on_sequence(BlockWarps{}, multiplies{}, number<1>{});
+};
+
+template <typename XDataType_,       // element type of the two input tensors
+          typename ComputeDataType_, // type the operands are converted to before adding
+          typename YDataType_,       // element type of the output tensor
+          typename BlockShape_>      // tiling constants, e.g. an AddShape<...>
+struct AddProblem // compile-time bundle of the types consumed by the Add kernel
+{
+    using XDataType       = remove_cvref_t<XDataType_>; // each alias strips cv/ref qualifiers
+    using ComputeDataType = remove_cvref_t<ComputeDataType_>;
+    using YDataType       = remove_cvref_t<YDataType_>;
+    using BlockShape      = remove_cvref_t<BlockShape_>; // read by the kernel and policy as S
+};
+
+struct AddDefaultPolicy // default Policy_ of the Add kernel
+{
+    template <typename Problem> // Problem::BlockShape must provide the AddShape constants
+    CK_TILE_DEVICE static constexpr auto MakeXBlockTileDistribution()
+    {
+        using S = typename Problem::BlockShape; // factors below: M sequence first, then N
+        return make_static_tile_distribution( // Add uses this for both inputs and the output
+            tile_distribution_encoding< // NOTE(review): argument roles below are assumed - confirm
+                sequence<>, // presumably the replicated dims: none
+                tuple<sequence<S::Repeat_M, S::WarpPerBlock_M, S::ThreadPerWarp_M, S::Vector_M>,
+                      sequence<S::Repeat_N, S::WarpPerBlock_N, S::ThreadPerWarp_N, S::Vector_N>>,
+                tuple<sequence<1, 2>, sequence<1, 2>>, // presumably: sequence (1 = M, 2 = N) per id
+                tuple<sequence<1, 1>, sequence<2, 2>>, // presumably: factor (1 = warp, 2 = thread)
+                sequence<1, 1, 2, 2>, // presumably: sequence owning each per-thread dim
+                sequence<0, 3, 0, 3>>{}); // presumably: factor there (0 = Repeat, 3 = Vector)
+    }
+};
+
+// Elementwise add kernel: y = x_a + x_b for row-major M x N tensors. A block starts at row
+// get_block_id() * S::Block_M and walks the N dimension in tiles of S::Block_N columns.
+template <typename Problem_, typename Policy_ = AddDefaultPolicy>
+struct Add
+{
+    using Problem = ck_tile::remove_cvref_t<Problem_>;
+    using Policy  = ck_tile::remove_cvref_t<Policy_>;
+
+    using XDataType       = ck_tile::remove_cvref_t<typename Problem::XDataType>;
+    using ComputeDataType = ck_tile::remove_cvref_t<typename Problem::ComputeDataType>;
+    using YDataType       = ck_tile::remove_cvref_t<typename Problem::YDataType>;
+
+    // p_x_a / p_x_b: inputs, p_y: output; all three are viewed as M x N with strides {N, 1}.
+    CK_TILE_DEVICE void operator()(const XDataType* p_x_a,
+                                   const XDataType* p_x_b,
+                                   YDataType* p_y,
+                                   index_t M,
+                                   index_t N) const
+    {
+        using S = typename Problem::BlockShape;
+
+        const auto x_m_n_a = make_naive_tensor_view<address_space_enum::global>(
+            p_x_a, make_tuple(M, N), make_tuple(N, 1), number<S::Vector_N>{}, number<1>{});
+        const auto x_m_n_b = make_naive_tensor_view<address_space_enum::global>(
+            p_x_b, make_tuple(M, N), make_tuple(N, 1), number<S::Vector_N>{}, number<1>{});
+        const auto y_m_n = make_naive_tensor_view<address_space_enum::global>(
+            p_y, make_tuple(M, N), make_tuple(N, 1), number<S::Vector_N>{}, number<1>{});
+
+        const auto iM           = get_block_id() * S::Block_M;
+        const auto tile_lengths = make_tuple(number<S::Block_M>{}, number<S::Block_N>{});
+
+        // All three windows use the same distribution so tiles can be combined per element.
+        auto x_window_a = make_tile_window(
+            x_m_n_a, tile_lengths, {iM, 0}, Policy::template MakeXBlockTileDistribution<Problem>());
+        auto x_window_b = make_tile_window(
+            x_m_n_b, tile_lengths, {iM, 0}, Policy::template MakeXBlockTileDistribution<Problem>());
+        auto y_window = make_tile_window(
+            y_m_n, tile_lengths, {iM, 0}, Policy::template MakeXBlockTileDistribution<Problem>());
+
+        index_t num_n_tile_iteration =
+            __builtin_amdgcn_readfirstlane(integer_divide_ceil(N, S::Block_N));
+
+        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        {
+            const auto xa = load_tile(x_window_a);
+            const auto xb = load_tile(x_window_b);
+            // The old contents of y are never used: create the output tile instead of loading it.
+            auto y = make_static_distributed_tensor<YDataType>(xa.get_tile_distribution());
+
+            constexpr auto spans = decltype(xa)::get_distributed_spans();
+            sweep_tile_span(spans[number<0>{}], [&](auto idx0) {
+                sweep_tile_span(spans[number<1>{}], [&](auto idx1) {
+                    constexpr auto i_j_idx = ck_tile::make_tuple(idx0, idx1);
+                    const auto a = ck_tile::type_convert<ComputeDataType>(xa[i_j_idx]);
+                    const auto b = ck_tile::type_convert<ComputeDataType>(xb[i_j_idx]);
+                    y(i_j_idx)   = ck_tile::type_convert<YDataType>(a + b);
+                });
+            });
+
+            store_tile(y_window, y);
+            move_tile_window(x_window_a, {0, S::Block_N});
+            move_tile_window(x_window_b, {0, S::Block_N});
+            move_tile_window(y_window, {0, S::Block_N});
+        }
+    }
+};
+
+} // namespace ck_tile
diff --git a/example/ck_tile/99_toy_example/01_add/reference_add.hpp b/example/ck_tile/99_toy_example/01_add/reference_add.hpp
new file mode 100644
index 0000000000..26a72286da
--- /dev/null
+++ b/example/ck_tile/99_toy_example/01_add/reference_add.hpp
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/host_tensor.hpp"
+#include <thread>
+
+namespace ck_tile {
+
+// Host reference of the add kernel: y(m, n) = xa(m, n) + xb(m, n). The sum is formed in
+// ComputeDataType and converted to YDataType once, instead of rounding each input first.
+template <typename XDataType, typename YDataType, typename ComputeDataType = float>
+CK_TILE_HOST void
+reference_add(const HostTensor<XDataType>& xa_m_n, const HostTensor<XDataType>& xb_m_n, HostTensor<YDataType>& y_m_n)
+{
+    const std::size_t N = xa_m_n.mDesc.get_lengths()[1];
+    auto add_row        = [&](auto m) {
+        for(std::size_t n = 0; n < N; ++n)
+            y_m_n(m, n) = type_convert<YDataType>(type_convert<ComputeDataType>(xa_m_n(m, n)) +
+                                                  type_convert<ComputeDataType>(xb_m_n(m, n)));
+    };
+    const unsigned num_thread = std::thread::hardware_concurrency(); // 0 when it is unknown
+    make_ParallelTensorFunctor(add_row, y_m_n.mDesc.get_lengths()[0])(num_thread ? num_thread : 1u);
+}
+
+} // namespace ck_tile
diff --git a/example/ck_tile/99_toy_example/CMakeLists.txt b/example/ck_tile/99_toy_example/CMakeLists.txt
new file mode 100644
index 0000000000..80024d45e8
--- /dev/null
+++ b/example/ck_tile/99_toy_example/CMakeLists.txt
@@ -0,0 +1,5 @@
+# Toy examples: append this directory to the include path of the examples added below.
+include_directories(AFTER ${CMAKE_CURRENT_LIST_DIR})
+
+# 01_add: elementwise add of two tensors
+add_subdirectory(01_add)
diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt
index 7f4ba2ed35..8a66732860 100644
--- a/example/ck_tile/CMakeLists.txt
+++ b/example/ck_tile/CMakeLists.txt
@@ -18,3 +18,4 @@ add_subdirectory(15_fused_moe)
 add_subdirectory(16_batched_gemm)
 add_subdirectory(17_grouped_gemm)
 add_subdirectory(35_batched_transpose)
+add_subdirectory(99_toy_example)