From 003784b1a30c426998918134b1b8ccbf130da146 Mon Sep 17 00:00:00 2001
From: amd-yashagar <yashagar@amd.com>
Date: Fri, 8 Aug 2025 11:23:35 +0000
Subject: [PATCH] buffer example

---
 example/ck_tile/03_gemm/gemm_reduce.cpp       |   0
 .../buffer_examples.cpp                       | 197 ++++++++++++++----
 .../ops/gemm/kernel/universal_gemm_kernel.hpp |   0
 3 files changed, 162 insertions(+), 35 deletions(-)
 create mode 100644 example/ck_tile/03_gemm/gemm_reduce.cpp
 create mode 100644 include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp
diff --git a/example/ck_tile/03_gemm/gemm_reduce.cpp b/example/ck_tile/03_gemm/gemm_reduce.cpp
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/example/ck_tile/22_documentation_examples/buffer_examples.cpp b/example/ck_tile/22_documentation_examples/buffer_examples.cpp
index 0e2b040512..4a3d364c29 100644
--- a/example/ck_tile/22_documentation_examples/buffer_examples.cpp
+++ b/example/ck_tile/22_documentation_examples/buffer_examples.cpp
@@ -1,47 +1,174 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck_tile/host.hpp"
+#include "ck_tile/core.hpp"
 #include <cstring>
-#include "ck_tile/ops/documentation_examples/kernel/example_buffer_kernel.hpp"
+#include <iostream>
+#include <vector>
 
-template <typename DataType>
-bool run()
+namespace ck_tile {
+
+struct TileDistributionExample
 {
-    using XDataType       = DataType;
-
-    auto problem_shape = ck_tile::make_tuple(10);
-
-    ck_tile::HostTensor<XDataType> x_host({10}, {1});
-
-    for(int i=0; i<10; i++)
+    CK_TILE_DEVICE void operator()(float* global_data,
+                                   ck_tile::index_t global_shape_0,
+                                   ck_tile::index_t global_shape_1) const
     {
-        x_host.data()[i] = static_cast<XDataType>(i);
+        if(threadIdx.x == 0 && blockIdx.x == 0)
+        {
+            printf("\n=== Tile Distribution Example (Device Kernel) ===\n");
+        }
+        block_sync_lds();
+
+        // Create a tile distribution encoding
+        // This defines how a tensor is distributed across threads
+        auto encoding = tile_distribution_encoding<
+            sequence<>,           // rs_lengths=[] - No replication dimensions
+            tuple<sequence<2, 2>, // hs_lengthss=[[2, 2], [2, 2]] - Hierarchical lengths for each X
+                                  // dimension
+                  sequence<2, 2>>,
+            tuple<sequence<1>, sequence<2>>, // ps_to_rhss_major=[[1], [2]] - P to RH major mappings
+            tuple<sequence<0>, sequence<0>>, // ps_to_rhss_minor=[[0], [0]] - P to RH minor mappings
+            sequence<1, 2>,                  // ys_to_rhs_major=[1, 2] - Y to RH major mappings
+            sequence<1, 1>>{};               // ys_to_rhs_minor=[1, 1] - Y to RH minor mappings
+
+        // Create the tile distribution from the encoding
+        auto distribution = make_static_tile_distribution(encoding);
+
+        // Calculate sizes from the distribution encoding
+        // x0_size = np.prod(distribution.encoding.hs_lengthss[0])
+        constexpr auto hs_lengths_0 = encoding.hs_lengthss_[number<0>{}]; // sequence<2, 2>
+        constexpr auto hs_lengths_1 = encoding.hs_lengthss_[number<1>{}]; // sequence<2, 2>
+
+        constexpr index_t x0_size = reduce_on_sequence(hs_lengths_0, multiplies{}, number<1>{});
+        constexpr index_t x1_size = reduce_on_sequence(hs_lengths_1, multiplies{}, number<1>{});
+
+        // Print distribution info (only from thread 0)
+        if(threadIdx.x == 0 && blockIdx.x == 0)
+        {
+            printf("\n- Tile distribution created:\n");
+            printf("  X dimensions: %d\n", distribution.get_num_of_dimension_x());
+            printf("  Y dimensions: %d\n", distribution.get_num_of_dimension_y());
+            printf("  P dimensions: %d\n", distribution.get_num_of_dimension_p());
+            printf("  X lengths: [%d, %d]\n", x0_size, x1_size);
+        }
+        block_sync_lds();
+
+        // Create packed tensor view (contiguous row-major) using helper
+        auto global_view = make_naive_tensor_view_packed<address_space_enum::global>(
+            global_data, make_tuple(global_shape_0, global_shape_1));
+
+        // Window configuration
+        auto window_lengths = make_tuple(x0_size, x1_size);
+
+        // Get current thread's warp and thread indices
+        index_t warp_id   = threadIdx.x / get_warp_size();
+        index_t thread_id = threadIdx.x % get_warp_size();
+
+        // Window origin - small offset from origin
+        auto window_origin = make_tuple(1, 3); // Small offset from origin
+
+        // Create tile window
+        auto tile_window = make_tile_window(global_view,
+                                            window_lengths,
+                                            {1, 3}, // Window origin as initializer list
+                                            distribution);
+
+        // Load distributed tensor
+        auto distributed_tensor = tile_window.load();
+
+        // Collect values by sweeping through the distributed tensor
+        constexpr index_t max_elements = x0_size * x1_size;
+        float collected_values[max_elements];
+        index_t value_count = 0;
+
+        // Sweep through the distributed tensor and collect values using sweep_tile API
+        sweep_tile(distributed_tensor, [&](auto idx) {
+            if(value_count < max_elements)
+            {
+                collected_values[value_count] = distributed_tensor(idx);
+                value_count++;
+            }
+        });
+
+        // Serialize printing in a fixed order for selected threads only.
+        static constexpr int print_thread_ids[] = {0, 1, 64, 65};
+        for(int sel : print_thread_ids)
+        {
+            block_sync_lds();
+            if(static_cast<int>(threadIdx.x) == sel)
+            {
+                printf("Partition index: (warp=%d, thread=%d)\n",
+                       static_cast<int>(warp_id),
+                       static_cast<int>(thread_id));
+                printf("Collected values: ");
+                for(index_t i = 0; i < value_count; i++)
+                {
+                    printf("%.0f", collected_values[i]);
+                    if(i < value_count - 1)
+                        printf(", ");
+                }
+                printf("\n\n");
+            }
+            block_sync_lds();
+        }
     }
-
-    ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes());
-
-    x_buf.ToDevice(x_host.data());
-
-    constexpr ck_tile::index_t kBlockSize  = 1;
-    constexpr ck_tile::index_t kBlockPerCu = 1;    
-    constexpr ck_tile::index_t kGridSize = 1;
-
-    using Kernel = ck_tile::Docs<XDataType>;
-
-    launch_kernel(ck_tile::stream_config{nullptr, true, 0, 0, 1},
-                                   ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
-                                       Kernel{},
-                                       1,
-                                       kBlockSize,
-                                       0,
-                                       static_cast<XDataType*>(x_buf.GetDeviceBuffer()),
-                                       problem_shape));
-
-    return true;
-}
+};
+} // namespace ck_tile
 
 int main()
 {
-    return run<int>() ? 0 : -2;
+    // Host-side allocation & initialization of pattern data
+    // Reproduce the compile-time sizes used in the kernel: hs_lengths = [2,2] => x sizes=4; global
+    // = 4+5 = 9
+    constexpr ck_tile::index_t global_shape_0 = 9;                               // x0_size(4) + 5
+    constexpr ck_tile::index_t global_shape_1 = 9;                               // x1_size(4) + 5
+    constexpr ck_tile::index_t total_elems    = global_shape_0 * global_shape_1; // 81
+
+    std::vector<float> h_global_data(total_elems);
+    for(ck_tile::index_t i = 0; i < global_shape_0; ++i)
+    {
+        for(ck_tile::index_t j = 0; j < global_shape_1; ++j)
+        {
+            h_global_data[i * global_shape_1 + j] = static_cast<float>(i * 100 + j);
+        }
+    }
+
+    ck_tile::DeviceMem d_global_data(sizeof(float) * total_elems);
+    d_global_data.ToDevice(h_global_data.data());
+
+    std::cout << "\nGlobal data (host print, to be used by device) shape=("
+              << static_cast<int>(global_shape_0) << "," << static_cast<int>(global_shape_1)
+              << ")\n\n";
+    for(ck_tile::index_t i = 0; i < global_shape_0; ++i)
+    {
+        for(ck_tile::index_t j = 0; j < global_shape_1; ++j)
+        {
+            std::cout << h_global_data[i * global_shape_1 + j];
+            if(j + 1 < global_shape_1)
+                std::cout << "\t";
+        }
+        std::cout << '\n';
+    }
+    std::cout << '\n';
+
+    constexpr ck_tile::index_t kBlockSize  = 128;
+    constexpr ck_tile::index_t kBlockPerCu = 1;
+    constexpr ck_tile::index_t kGridSize   = 1;
+
+    using Kernel   = ck_tile::TileDistributionExample;
+    float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, 0, 1},
+                                   ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
+                                       Kernel{},
+                                       kGridSize,
+                                       kBlockSize,
+                                       0,
+                                       static_cast<float*>(d_global_data.GetDeviceBuffer()),
+                                       global_shape_0,
+                                       global_shape_1));
+
+    std::cout << "Kernel execution completed. Average time: " << ave_time << " ms" << std::endl;
+
+    return 0;
 }
diff --git a/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp
new file mode 100644
index 0000000000..e69de29bb2