[CK_TILE] Add basic tutorials to separate directory

- Move tutorial_01 through tutorial_07 to example/ck_tile/99_toy_tutorial
- Update CMakeLists.txt to reflect new tutorial organization
- Remove debug executable from tutorial_07
This commit is contained in:
Amir Ghamarian
2025-12-08 15:47:01 +00:00
parent 9afbb81e57
commit 576956298c
27 changed files with 5388 additions and 1 deletion

View File

@@ -0,0 +1,46 @@
# CK_TILE Tutorial Examples
# Educational examples for learning ck_tile API

include_directories(AFTER
    ${CMAKE_CURRENT_LIST_DIR}
)

# Tutorial Series - Tensor View API
# Each tutorial builds on the previous one. The directories are listed once,
# in order, and every subdirectory is added only when its own CMakeLists.txt
# exists, so a partially-synced tree still configures cleanly. Adding a new
# tutorial only requires appending its directory name to this list.
set(CK_TILE_TUTORIAL_DIRS
    tutorial_01_tensor_fundamentals              # 01: Tensor Fundamentals - basic tensor concepts
    tutorial_02_tensor_adaptors                  # 02: Tensor Adaptors - advanced layout transformations
    tutorial_03_padding_and_tiles                # 03: Padding with Tile Windows
    tutorial_04_descriptor_vs_adaptor            # 04: Descriptor vs Adaptor - understanding the differences
    tutorial_05_basic_distributed_gemm           # 05: Basic Distributed GEMM
    tutorial_06_tile_sweeping_gemm               # 06: Tile Sweeping GEMM
    tutorial_07_tile_sweeping_with_y_repetition  # 07: Tile Sweeping with Y-Dimension Repetition
)

foreach(tutorial_dir IN LISTS CK_TILE_TUTORIAL_DIRS)
    if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${tutorial_dir}/CMakeLists.txt)
        add_subdirectory(${tutorial_dir})
    endif()
endforeach()

message(STATUS "CK_TILE Tutorial examples configured")

View File

@@ -0,0 +1,21 @@
# Tutorial 01: Tensor Fundamentals
# Complete foundation covering descriptors, views, coordinates, and element access

# Create executable for tensor fundamentals tutorial
add_executable(aa_tutorial_01_fundamentals tensor_fundamentals.cpp)

# Headers are resolved two levels up from this directory, matching the
# ck_tile example tree layout this tutorial lives in.
target_include_directories(aa_tutorial_01_fundamentals PRIVATE
    ${CMAKE_CURRENT_SOURCE_DIR}/../..
)

# Deliberately debug-friendly build: -O0/-g keep the kernel unoptimized and
# inspectable, and --save-temps preserves intermediate assembly files for the
# "disassemble vectorized_read_*" exercises referenced in the tutorial source.
target_compile_options(aa_tutorial_01_fundamentals PRIVATE
    -Wall
    -O0
    -g
    --save-temps
)

# Message for build output
message(STATUS "Added Tutorial 01: Tensor Fundamentals - Complete foundation for ck_tile tensor system")

View File

@@ -0,0 +1,603 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
/*
* Tutorial 01: Tensor Fundamentals - Complete Foundation
*
* This tutorial teaches the three core concepts of ck_tile tensor system:
* 1. Tensor Descriptor - defines tensor layout (lengths + strides)
* 2. Tensor View - combines descriptor with memory pointer for access
* 3. Tensor Coordinate - multi-dimensional index bound to a descriptor
*
* Key Learning: ALL access goes through tensor_view API using thread_buffer
*/
#include <cmath>
#include <iomanip>
#include <iostream>
#include <numeric>
#include <vector>

#include "ck_tile/core.hpp"
#include "ck_tile/host.hpp"
using namespace ck_tile;
// Vectorized read functions for easy debugging/disassembly
// Note: These functions demonstrate the API but may be scalarized by the compiler
// when returning by value. For true vectorization, use get_vectorized_elements inline.
// Load two consecutive elements starting at `offset` through a 1-D global
// tensor view. The view declares a guaranteed last-dimension vector length
// of 2 with unit stride, permitting a single 2-wide load; the pair comes
// back in a thread_buffer.
template<typename DataType>
CK_TILE_DEVICE thread_buffer<DataType, 2>
vectorized_read_2(const DataType* p_data, index_t offset)
{
    // Contiguous 12-element view with an explicit 2-wide vector guarantee.
    auto src = make_naive_tensor_view<address_space_enum::global>(
        p_data,
        make_tuple(12),
        make_tuple(1),
        number<2>{},  // GuaranteedLastDimensionVectorLength
        number<1>{}); // GuaranteedLastDimensionVectorStride
    auto at = make_tensor_coordinate(src.get_tensor_descriptor(), make_tuple(offset));
    return src.template get_vectorized_elements<thread_buffer<DataType, 2>>(at, 0);
}
// Load four consecutive elements starting at `offset` through a 1-D global
// tensor view. A guaranteed last-dimension vector length of 4 (unit stride)
// is declared so the backend may emit one 4-wide load; the values are
// returned in a thread_buffer.
template<typename DataType>
CK_TILE_DEVICE thread_buffer<DataType, 4>
vectorized_read_4(const DataType* p_data, index_t offset)
{
    // Contiguous 12-element view with an explicit 4-wide vector guarantee.
    auto src = make_naive_tensor_view<address_space_enum::global>(
        p_data,
        make_tuple(12),
        make_tuple(1),
        number<4>{},  // GuaranteedLastDimensionVectorLength
        number<1>{}); // GuaranteedLastDimensionVectorStride
    auto at = make_tensor_coordinate(src.get_tensor_descriptor(), make_tuple(offset));
    return src.template get_vectorized_elements<thread_buffer<DataType, 4>>(at, 0);
}
// Store four consecutive elements from `buffer` starting at `offset` through
// a 1-D global tensor view.
// Fix: mirrors vectorized_read_4 by declaring the same guaranteed
// last-dimension vector length of 4 (unit stride). Previously the write view
// omitted these guarantees, inconsistent with its paired read, so the store
// could fall back to scalar accesses.
template<typename DataType>
CK_TILE_DEVICE void
vectorized_write_4(DataType* p_data, index_t offset, thread_buffer<DataType, 4> buffer)
{
    auto view = make_naive_tensor_view<address_space_enum::global>(
        p_data,
        make_tuple(12), // total elements
        make_tuple(1),  // stride (contiguous)
        number<4>{},    // GuaranteedLastDimensionVectorLength
        number<1>{}     // GuaranteedLastDimensionVectorStride
    );
    auto desc = view.get_tensor_descriptor();
    auto coord = make_tensor_coordinate(desc, make_tuple(offset));
    view.set_vectorized_elements(coord, 0, buffer);
}
// Additional functions with fp16 to demonstrate vectorization with smaller types
// fp16 variant: load four consecutive half_t values starting at `offset`
// from a contiguous 24-element global view.
// NOTE(review): unlike vectorized_read_4, no GuaranteedLastDimensionVectorLength
// hint is passed here, so this load may be scalarized — confirm whether the
// omission is intentional for the fp16 demonstrations.
CK_TILE_DEVICE thread_buffer<half_t, 4>
vectorized_read_4_fp16(const half_t* p_data, index_t offset)
{
    auto view = make_naive_tensor_view<address_space_enum::global>(
        p_data,
        make_tuple(24), // total elements (more for fp16)
        make_tuple(1)   // stride (contiguous)
    );
    auto desc = view.get_tensor_descriptor();
    auto coord = make_tensor_coordinate(desc, make_tuple(offset));
    return view.template get_vectorized_elements<thread_buffer<half_t, 4>>(coord, 0);
}
// fp16 variant: load eight consecutive half_t values starting at `offset`
// from a contiguous 24-element global view, returned in a thread_buffer.
CK_TILE_DEVICE thread_buffer<half_t, 8>
vectorized_read_8_fp16(const half_t* p_data, index_t offset)
{
    auto src = make_naive_tensor_view<address_space_enum::global>(
        p_data,
        make_tuple(24),  // total elements
        make_tuple(1));  // stride (contiguous)
    auto at = make_tensor_coordinate(src.get_tensor_descriptor(), make_tuple(offset));
    return src.template get_vectorized_elements<thread_buffer<half_t, 8>>(at, 0);
}
// The kernel that demonstrates all fundamental concepts:
// descriptor -> view -> coordinate -> element access, printed step by step.
// Only thread 0 does any work so the printf output stays ordered; all other
// threads of the block return immediately.
template<typename DataType>
struct TensorFundamentalsKernel
{
    // Threads per block used by the host-side launch; see main().
    static constexpr index_t kBlockSize = 64;

    // p_input:  H*W*C contiguous elements (row-major) read by the demos
    // p_output: H*W*C contiguous elements written by the demos
    // H, W, C:  logical tensor extents (host passes 2, 3, 2)
    CK_TILE_DEVICE void operator()(const DataType* p_input,
                                   DataType* p_output,
                                   index_t H, index_t W, index_t C) const
    {
        // Only thread 0 for clean output
        if(get_thread_id() != 0) return;
        printf("\n=== TENSOR FUNDAMENTALS IN CK_TILE ===\n\n");

        //==================================================================
        // PART 1: TENSOR DESCRIPTOR
        //==================================================================
        printf("PART 1: TENSOR DESCRIPTOR\n");
        printf("-------------------------\n");
        // A tensor descriptor defines the layout of a tensor.
        // It contains: lengths (shape) + strides (memory layout).
        // Create a descriptor for [H,W,C] tensor in row-major layout.
        auto hwc_descriptor = make_naive_tensor_descriptor(
            make_tuple(H, W, C),  // lengths: [2, 3, 2]
            make_tuple(W*C, C, 1) // strides: [6, 2, 1] for row-major
        );
        // Access descriptor properties.
        auto lengths = hwc_descriptor.get_lengths();
        // Note: Descriptors don't expose strides directly after transformation,
        // which is why the stride values below are recomputed from H/W/C.
        printf("Descriptor for [H=%ld, W=%ld, C=%ld] tensor:\n",
               static_cast<long>(H), static_cast<long>(W), static_cast<long>(C));
        printf("  Lengths: [%ld, %ld, %ld]\n",
               static_cast<long>(lengths.at(number<0>{})),
               static_cast<long>(lengths.at(number<1>{})),
               static_cast<long>(lengths.at(number<2>{})));
        printf("  Strides: [%ld, %ld, %ld] (row-major)\n",
               static_cast<long>(W*C), static_cast<long>(C), static_cast<long>(1));
        printf("  Memory formula: offset = h*%ld + w*%ld + c*%ld\n\n",
               static_cast<long>(W*C), static_cast<long>(C), static_cast<long>(1));

        //==================================================================
        // PART 2: TENSOR VIEW - Three Creation Methods
        //==================================================================
        printf("PART 2: TENSOR VIEW (Descriptor + Memory)\n");
        printf("-----------------------------------------\n");
        // Method 1: Explicit strides (most control).
        printf("Method 1: make_naive_tensor_view with explicit strides\n");
        auto view1 = make_naive_tensor_view<address_space_enum::global>(
            p_input,              // GPU memory pointer
            make_tuple(H, W, C),  // lengths
            make_tuple(W*C, C, 1) // explicit strides
        );
        printf("  Created view with shape [%ld,%ld,%ld] and strides [%ld,%ld,%ld]\n",
               static_cast<long>(H), static_cast<long>(W), static_cast<long>(C),
               static_cast<long>(W*C), static_cast<long>(C), static_cast<long>(1));
        // Method 2: Packed/contiguous (auto-computes row-major strides).
        printf("\nMethod 2: make_naive_tensor_view_packed (auto strides)\n");
        auto view2 = make_naive_tensor_view_packed<address_space_enum::global>(
            p_input,             // GPU memory pointer
            make_tuple(H, W, C)  // lengths only, strides auto-computed
        );
        printf("  Created packed view - strides computed automatically\n");
        printf("  For row-major: last dim stride=1, each dim stride = next_dim_stride * next_dim_length\n");
        // Method 3: From existing descriptor (reuses hwc_descriptor from Part 1).
        printf("\nMethod 3: make_tensor_view from descriptor\n");
        auto view3 = make_tensor_view<address_space_enum::global>(
            p_input,       // GPU memory pointer
            hwc_descriptor // existing descriptor
        );
        printf("  Created view using pre-existing descriptor\n");
        // Demonstrate all three views access the same data: read [0,1,0]
        // through each of them and show identical values.
        printf("\nVerifying all three methods create equivalent views:\n");
        {
            auto coord_test = make_tensor_coordinate(
                view1.get_tensor_descriptor(), make_tuple(0, 1, 0));
            auto val1 = view1.template get_vectorized_elements<thread_buffer<DataType, 1>>(
                coord_test, 0)[number<0>{}];
            auto coord_test2 = make_tensor_coordinate(
                view2.get_tensor_descriptor(), make_tuple(0, 1, 0));
            auto val2 = view2.template get_vectorized_elements<thread_buffer<DataType, 1>>(
                coord_test2, 0)[number<0>{}];
            auto coord_test3 = make_tensor_coordinate(
                view3.get_tensor_descriptor(), make_tuple(0, 1, 0));
            auto val3 = view3.template get_vectorized_elements<thread_buffer<DataType, 1>>(
                coord_test3, 0)[number<0>{}];
            printf("  view1[0,1,0] = %.0f (explicit strides)\n", static_cast<float>(val1));
            printf("  view2[0,1,0] = %.0f (packed/auto strides)\n", static_cast<float>(val2));
            printf("  view3[0,1,0] = %.0f (from descriptor)\n", static_cast<float>(val3));
            printf("  ✓ All three methods produce identical views!\n\n");
        }

        //==================================================================
        // PART 3: TENSOR COORDINATE
        //==================================================================
        printf("PART 3: TENSOR COORDINATE (Multi-dim Indexing)\n");
        printf("-----------------------------------------------\n");
        // Coordinates are multi-dimensional indices bound to a descriptor.
        // They know how to map to linear memory offsets.
        auto desc = view1.get_tensor_descriptor();
        // Create coordinate for position [1,2,0].
        auto coord = make_tensor_coordinate(desc, make_tuple(1, 2, 0));
        // Coordinate can compute its linear offset.
        index_t offset = coord.get_offset();
        printf("Coordinate [1,2,0] maps to linear offset: %ld\n",
               static_cast<long>(offset));
        printf("  Calculation: 1*%ld + 2*%ld + 0*%ld = %ld\n\n",
               static_cast<long>(W*C), static_cast<long>(C),
               static_cast<long>(1), static_cast<long>(offset));

        //==================================================================
        // PART 4: ELEMENT ACCESS - The Critical Pattern
        //==================================================================
        printf("PART 4: ELEMENT ACCESS (thread_buffer Pattern)\n");
        printf("----------------------------------------------\n");
        printf("CRITICAL: get_vectorized_elements returns thread_buffer, NOT value!\n\n");
        // Reading elements - THE CORRECT PATTERN.
        printf("Reading element at [0,0,0]:\n");
        {
            auto read_coord = make_tensor_coordinate(desc, make_tuple(0, 0, 0));
            // get_vectorized_elements returns thread_buffer<T,N>, not T!
            auto buffer = view1.template get_vectorized_elements<thread_buffer<DataType, 1>>(
                read_coord, // coordinate
                0           // linear_offset (usually 0)
            );
            // Extract actual value from thread_buffer with a compile-time index.
            DataType value = buffer[number<0>{}];
            printf("  Step 1: Create coordinate for [0,0,0]\n");
            printf("  Step 2: Call get_vectorized_elements -> returns thread_buffer\n");
            printf("  Step 3: Extract value with [number<0>{}]\n");
            printf("  Value at [0,0,0] = %.0f\n\n", static_cast<float>(value));
        }
        // Writing elements - THE CORRECT PATTERN.
        printf("Writing element at [0,0,1]:\n");
        {
            auto write_coord = make_tensor_coordinate(desc, make_tuple(0, 0, 1));
            // Create thread_buffer for writing; host-side verification in main()
            // expects this 99 at linear index 1.
            thread_buffer<DataType, 1> write_buffer;
            write_buffer[number<0>{}] = 99.0f;
            // Write to output view (scoped to this block; Part 6 re-creates one).
            auto output_view = make_naive_tensor_view<address_space_enum::global>(
                p_output,
                make_tuple(H, W, C),
                make_tuple(W*C, C, 1)
            );
            output_view.set_vectorized_elements(write_coord, 0, write_buffer);
            printf("  Step 1: Create thread_buffer with value 99\n");
            printf("  Step 2: Create coordinate for [0,0,1]\n");
            printf("  Step 3: Call set_vectorized_elements with buffer\n");
            printf("  Written value 99 to output[0,0,1]\n\n");
        }

        //==================================================================
        // PART 4.5: VECTORIZED ACCESS - Reading Multiple Elements
        //==================================================================
        printf("PART 4.5: VECTORIZED ACCESS (Performance Optimization)\n");
        printf("-------------------------------------------------------\n");
        printf("CRITICAL: Vectorization reads/writes multiple elements in one operation!\n\n");
        // Create a flattened view for easier vectorized access.
        auto flat_view = make_naive_tensor_view<address_space_enum::global>(
            p_input,
            make_tuple(H*W*C), // [12] - all elements in linear order
            make_tuple(1)      // stride = 1 (contiguous)
        );
        auto flat_desc = flat_view.get_tensor_descriptor();
        // Example 1: Reading 2 elements at once (vector size = 2).
        printf("Example 1: Reading 2 elements at once\n");
        {
            // Call the vectorized_read_2 function (easy to disassemble in debugger).
            auto buffer = vectorized_read_2(p_input, 0);
            DataType val0 = buffer[number<0>{}];
            DataType val1 = buffer[number<1>{}];
            printf("  Position [0]: Read 2 elements in one operation\n");
            printf("    buffer[0] = %.0f\n", static_cast<float>(val0));
            printf("    buffer[1] = %.0f\n", static_cast<float>(val1));
            printf("  ✓ 2x faster than reading elements individually!\n");
            printf("  ✓ In debugger: 'disassemble vectorized_read_2<float>'\n\n");
        }
        // Example 2: Reading 4 elements at once (vector size = 4).
        printf("Example 2: Reading 4 elements at once\n");
        {
            // Call the vectorized_read_4 function (easy to disassemble in debugger).
            auto buffer = vectorized_read_4(p_input, 4);
            printf("  Position [4]: Read 4 elements in one operation\n");
            printf("    buffer[0] = %.0f\n", static_cast<float>(buffer[number<0>{}]));
            printf("    buffer[1] = %.0f\n", static_cast<float>(buffer[number<1>{}]));
            printf("    buffer[2] = %.0f\n", static_cast<float>(buffer[number<2>{}]));
            printf("    buffer[3] = %.0f\n", static_cast<float>(buffer[number<3>{}]));
            printf("  ✓ 4x faster than reading elements individually!\n");
            printf("  ✓ In debugger: 'disassemble vectorized_read_4<float>'\n\n");
        }
        // Example 3: Writing vectorized data.
        printf("Example 3: Writing multiple elements at once\n");
        {
            // Create a buffer with 4 values (100..103 land at output[4..7]).
            thread_buffer<DataType, 4> write_buffer;
            write_buffer[number<0>{}] = 100.0f;
            write_buffer[number<1>{}] = 101.0f;
            write_buffer[number<2>{}] = 102.0f;
            write_buffer[number<3>{}] = 103.0f;
            // Call the vectorized_write_4 function (easy to disassemble in debugger).
            vectorized_write_4(p_output, 4, write_buffer);
            printf("  Position [4-7]: Wrote 4 elements in one operation\n");
            printf("    Wrote: 100, 101, 102, 103\n");
            printf("  ✓ 4x faster than writing elements individually!\n");
            printf("  ✓ In debugger: 'disassemble vectorized_write_4<float>'\n\n");
        }
        // Example 4: Vectorized copy operation (TRUE VECTORIZATION!).
        printf("Example 4: Vectorized copy - INLINE usage (TRUE vectorization)\n");
        {
            auto output_flat_view = make_naive_tensor_view<address_space_enum::global>(
                p_output,
                make_tuple(H*W*C),
                make_tuple(1)
            );
            auto out_flat_desc = output_flat_view.get_tensor_descriptor();
            // Copy first 8 elements using vector size 4 (2 iterations).
            // THIS is where real vectorization happens - inline, no function calls!
            printf("  Copying first 8 elements using 2 vectorized operations:\n");
            for(index_t i = 0; i < 8; i += 4) {
                auto in_coord = make_tensor_coordinate(flat_desc, make_tuple(i));
                auto out_coord = make_tensor_coordinate(out_flat_desc, make_tuple(i));
                // Read 4 elements - INLINE vectorized load (not through function).
                auto buffer = flat_view.template get_vectorized_elements<
                    thread_buffer<DataType, 4>>(in_coord, 0);
                // Write 4 elements (skip positions 4-7 which Example 3 already wrote).
                if(i != 4) {
                    output_flat_view.set_vectorized_elements(out_coord, 0, buffer);
                }
                printf("    Iteration %ld: Copied elements [%ld-%ld]\n",
                       static_cast<long>(i/4), static_cast<long>(i), static_cast<long>(i+3));
            }
            printf("  ✓ Copied 8 elements with only 2 memory operations!\n");
            printf("  ✓ THIS loop shows true vectorization in assembly!\n\n");
        }
        printf("Vectorization Key Points:\n");
        printf("  • Vector sizes: 1, 2, 4, 8 (powers of 2)\n");
        printf("  • Requires contiguous memory layout (stride=1 in access dimension)\n");
        printf("  • Dramatically improves memory bandwidth utilization\n");
        printf("  • Essential for high-performance GPU kernels\n");
        printf("  • Access each element with buffer[number<i>{}]\n");
        printf("  • IMPORTANT: Use inline for true vectorization, not function calls!\n");
        printf("  • Standalone functions may be scalarized when returning by value\n\n");

        //==================================================================
        // PART 5: MULTIPLE VIEWS OF SAME DATA
        //==================================================================
        printf("PART 5: MULTIPLE VIEWS OF SAME DATA\n");
        printf("------------------------------------\n");
        // Create two different views of the same memory.
        // View A: [H, W, C] = [2, 3, 2]
        auto view_hwc = make_naive_tensor_view<address_space_enum::global>(
            p_input,
            make_tuple(H, W, C),
            make_tuple(W*C, C, 1)
        );
        // View B: [HW, C] = [6, 2] - flattened spatial dimensions.
        auto view_hw_c = make_naive_tensor_view<address_space_enum::global>(
            p_input,
            make_tuple(H*W, C),
            make_tuple(C, 1)
        );
        printf("Two views of same memory:\n");
        printf("  View A: [H=%ld, W=%ld, C=%ld]\n",
               static_cast<long>(H), static_cast<long>(W), static_cast<long>(C));
        printf("  View B: [HW=%ld, C=%ld]\n",
               static_cast<long>(H*W), static_cast<long>(C));
        // Show they access the same data.
        printf("\nAccessing same element through different views:\n");
        // Access element at h=1, w=1, c=0 through View A.
        auto desc_a = view_hwc.get_tensor_descriptor();
        auto coord_a = make_tensor_coordinate(desc_a, make_tuple(1, 1, 0));
        auto buffer_a = view_hwc.template get_vectorized_elements<thread_buffer<DataType, 1>>(
            coord_a, 0);
        DataType val_a = buffer_a[number<0>{}];
        // Access same element through View B at hw=4 (1*3+1), c=0.
        auto desc_b = view_hw_c.get_tensor_descriptor();
        auto coord_b = make_tensor_coordinate(desc_b, make_tuple(4, 0));
        auto buffer_b = view_hw_c.template get_vectorized_elements<thread_buffer<DataType, 1>>(
            coord_b, 0);
        DataType val_b = buffer_b[number<0>{}];
        printf("  View A[1,1,0] = %.0f\n", static_cast<float>(val_a));
        printf("  View B[4,0]   = %.0f (same value!)\n", static_cast<float>(val_b));
        printf("  Both access offset %ld in memory\n\n",
               static_cast<long>(coord_a.get_offset()));

        //==================================================================
        // PART 6: PRACTICAL EXAMPLE - Copy with Views
        //==================================================================
        printf("PART 6: PRACTICAL EXAMPLE - Copy Data\n");
        printf("--------------------------------------\n");
        // Create output view (function scope; shadows nothing - the Part 4
        // output_view was confined to its own braces).
        auto output_view = make_naive_tensor_view<address_space_enum::global>(
            p_output,
            make_tuple(H, W, C),
            make_tuple(W*C, C, 1)
        );
        auto out_desc = output_view.get_tensor_descriptor();
        // Copy all elements using tensor_view API.
        index_t count = 0;
        for(index_t h = 0; h < H; h++) {
            for(index_t w = 0; w < W; w++) {
                for(index_t c = 0; c < C; c++) {
                    // Read from input.
                    auto in_coord = make_tensor_coordinate(desc, make_tuple(h, w, c));
                    auto in_buffer = view1.template get_vectorized_elements<
                        thread_buffer<DataType, 1>>(in_coord, 0);
                    // Write to output (except [0,0,1] which we already wrote as 99).
                    if(!(h == 0 && w == 0 && c == 1)) {
                        auto out_coord = make_tensor_coordinate(out_desc, make_tuple(h, w, c));
                        output_view.set_vectorized_elements(out_coord, 0, in_buffer);
                    }
                    count++;
                }
            }
        }
        printf("Copied %ld elements using tensor_view API\n", static_cast<long>(count));
        printf("Note: output[0,0,1] = 99 (modified), rest copied from input\n\n");

        //==================================================================
        // SUMMARY
        //==================================================================
        printf("=== KEY TAKEAWAYS ===\n");
        printf("1. Descriptor = Lengths + Strides (defines layout)\n");
        printf("2. View = Descriptor + Memory (enables access)\n");
        printf("3. Coordinate = Multi-dim index bound to descriptor\n");
        printf("4. ALWAYS: get_vectorized_elements returns thread_buffer!\n");
        printf("5. ALWAYS: Extract value with [number<0>{}], [number<1>{}], etc.\n");
        printf("6. Vectorization: Use thread_buffer<T,N> with N=2,4,8 for performance\n");
        printf("7. NEVER: Access memory directly - use tensor_view API\n\n");
    }
};
// Host driver for Tutorial 01.
// Builds a tiny [H=2, W=3, C=2] float tensor with values 1..12, prints both
// its raw memory order and its logical layout, launches
// TensorFundamentalsKernel on a single block, then verifies the device
// results: linear index 1 (position [0,0,1]) must read 99, every other
// element must equal the input.
// Returns 0 on success, 1 when no GPU is found or any element mismatches.
int main()
{
    std::cout << "\n================================================\n";
    std::cout << "Tutorial 01: Tensor Fundamentals\n";
    std::cout << "================================================\n\n";

    // Initialize HIP and require at least one visible device.
    int device_count;
    hip_check_error(hipGetDeviceCount(&device_count));
    if(device_count == 0) {
        std::cerr << "No GPU devices found!\n";
        return 1;
    }
    hip_check_error(hipSetDevice(0));
    hipDeviceProp_t props;
    hip_check_error(hipGetDeviceProperties(&props, 0));
    std::cout << "Using GPU: " << props.name << "\n";

    // Small tensor for demonstration; strides below assume row-major layout.
    constexpr index_t H = 2;
    constexpr index_t W = 3;
    constexpr index_t C = 2;
    constexpr index_t size = H * W * C;
    std::cout << "\nTensor configuration:\n";
    std::cout << "  Shape: [" << H << ", " << W << ", " << C << "]\n";
    std::cout << "  Total elements: " << size << "\n";
    std::cout << "  Layout: Row-major (strides = [" << W*C << ", " << C << ", 1])\n\n";

    // Create test data: 1, 2, 3, 4, ... 12
    std::vector<float> h_input(size);
    std::iota(h_input.begin(), h_input.end(), 1.0f);
    std::cout << "Input data (row-major memory order):\n";
    for(index_t i = 0; i < size; ++i) {
        if(i % C == 0 && i > 0) std::cout << " | ";
        std::cout << std::setw(2) << h_input[i] << " ";
    }
    std::cout << "\n";
    std::cout << "\nLogical view [H,W,C]:\n";
    for(index_t h = 0; h < H; h++) {
        std::cout << "  H=" << h << ": ";
        for(index_t w = 0; w < W; w++) {
            std::cout << "[";
            for(index_t c = 0; c < C; c++) {
                index_t idx = h * W * C + w * C + c; // row-major linearization
                std::cout << std::setw(2) << h_input[idx];
                if(c < C-1) std::cout << ",";
            }
            std::cout << "] ";
        }
        std::cout << "\n";
    }

    // Allocate device memory and stage input; output starts zeroed so any
    // element the kernel fails to write is caught by verification.
    DeviceMem d_input(size * sizeof(float));
    DeviceMem d_output(size * sizeof(float));
    d_input.ToDevice(h_input.data(), size * sizeof(float));
    std::vector<float> h_zeros(size, 0.0f);
    d_output.ToDevice(h_zeros.data(), size * sizeof(float));

    // Launch a single block; the kernel itself restricts work to thread 0.
    constexpr index_t block_size = TensorFundamentalsKernel<float>::kBlockSize;
    stream_config stream;
    std::cout << "\nLaunching kernel...\n";
    std::cout << "=====================================\n";
    launch_kernel(stream,
                  make_kernel<block_size>(
                      TensorFundamentalsKernel<float>{},
                      dim3(1),          // 1 block
                      dim3(block_size), // 64 threads
                      0,                // no shared memory
                      static_cast<const float*>(d_input.GetDeviceBuffer()),
                      static_cast<float*>(d_output.GetDeviceBuffer()),
                      H, W, C));
    hip_check_error(hipDeviceSynchronize());
    std::cout << "=====================================\n";

    // Copy output back.
    std::vector<float> h_output(size);
    d_output.FromDevice(h_output.data(), size * sizeof(float));

    // Verify results.
    std::cout << "\nOutput verification:\n";
    bool passed = true;
    for(index_t i = 0; i < size; ++i) {
        // The kernel wrote 99 to position [0,0,1] (linear index 1).
        float expected = (i == 1) ? 99.0f : h_input[i];
        // Use std::fabs from <cmath>: plain std::abs without <cmath> may bind
        // to the int overload and truncate sub-1.0 mismatches to 0.
        if(std::fabs(h_output[i] - expected) > 1e-6f) {
            passed = false;
            std::cout << "  ✗ Mismatch at index " << i
                      << ": expected " << expected
                      << ", got " << h_output[i] << "\n";
        }
    }
    if(passed) {
        std::cout << "  ✓ All elements correct!\n";
        std::cout << "  ✓ output[0,0,1] = 99 (modified as expected)\n";
        std::cout << "  ✓ All other elements copied correctly\n";
    }

    std::cout << "\n=== Tutorial Complete ===\n";
    std::cout << "You now understand:\n";
    std::cout << "- Tensor descriptors (layout definition)\n";
    std::cout << "- Tensor views (memory access abstraction)\n";
    std::cout << "- Tensor coordinates (multi-dimensional indexing)\n";
    std::cout << "- The thread_buffer pattern for element access\n";
    std::cout << "- Vectorized access with thread_buffer<T,N> for performance\n";
    std::cout << "- Creating multiple views of the same data\n\n";
    return passed ? 0 : 1;
}

View File

@@ -0,0 +1,22 @@
# Tutorial 02: Tensor Adaptors
# Demonstrates make_single_stage_tensor_adaptor, transform_tensor_adaptor,
# and chain_tensor_adaptors for advanced, zero-copy layout transformations.
add_executable(aa_tutorial_02_adaptors tensor_adaptors.cpp)

# Debug-oriented flags on purpose: unoptimized (-O0) with symbols (-g),
# keeping intermediate compiler output (--save-temps) for inspection.
target_compile_options(aa_tutorial_02_adaptors PRIVATE -Wall -O0 -g --save-temps)

# Resolve ck_tile headers from the example root two directories up.
target_include_directories(aa_tutorial_02_adaptors PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../..)

message(STATUS "Added Tutorial 02: Tensor Adaptors - Advanced layout transformations with adaptor methods")

View File

@@ -0,0 +1,531 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
/*
* Tutorial 02: Tensor Adaptors - Advanced Layout Transformations
*
* This tutorial teaches the three core tensor adaptor methods:
* 1. make_single_stage_tensor_adaptor - Create a single-stage transformation
* 2. transform_tensor_adaptor - Add new transformations to existing adaptor
* 3. chain_tensor_adaptors - Chain two adaptors together
*
* Key Learning: Tensor adaptors enable zero-copy view transformations for
* complex memory layouts used in high-performance GPU kernels.
*/
#include <iostream>
#include <vector>
#include <iomanip>
#include <numeric>
#include "ck_tile/core.hpp"
#include "ck_tile/host.hpp"
using namespace ck_tile;
template<typename DataType>
struct TensorAdaptorsKernel
{
static constexpr index_t kBlockSize = 64;
// Part 1: make_single_stage_tensor_adaptor examples.
// Builds adaptors whose transforms all apply in one stage, then checks each
// mapping by feeding a "top" (transformed) index into calculate_bottom_index
// and printing the recovered "bottom" (original) index.
CK_TILE_DEVICE static void demonstrate_single_stage()
{
    printf("PART 1: make_single_stage_tensor_adaptor\n");
    printf("=========================================\n\n");
    printf("Purpose: Create a tensor adaptor with transformations applied in a single stage.\n");
    printf("This is the foundation for building complex layout transformations.\n\n");
    // Example 1.1: Simple dimension split (Unmerge).
    printf("Example 1.1: Split M dimension for tiling\n");
    printf("------------------------------------------\n");
    {
        constexpr index_t M = 128;
        constexpr index_t K = 64;
        constexpr index_t M0 = 4;
        constexpr index_t M1 = 32;
        printf("Input layout: [M=%ld, K=%ld]\n", static_cast<long>(M), static_cast<long>(K));
        printf("Goal: Split M into [M0=%ld, M1=%ld] for tiling\n",
               static_cast<long>(M0), static_cast<long>(M1));
        // Unmerge splits dimension M into [M0, M1]; K is passed through unchanged.
        auto transforms = make_tuple(
            make_unmerge_transform(make_tuple(number<M0>{}, number<M1>{})),
            make_pass_through_transform(number<K>{})
        );
        // lower_dims: which bottom dimension each transform consumes
        // (transform 0 reads dim 0 = M, transform 1 reads dim 1 = K).
        auto lower_dims = make_tuple(sequence<0>{}, sequence<1>{});
        // upper_dims: which top dimensions each transform produces
        // (M -> top dims 0,1; K -> top dim 2).
        auto upper_dims = make_tuple(sequence<0, 1>{}, sequence<2>{});
        auto adaptor = make_single_stage_tensor_adaptor(
            transforms, lower_dims, upper_dims
        );
        printf("\nAdaptor created:\n");
        printf("  Input:  [M, K] = [%ld, %ld]\n",
               static_cast<long>(M), static_cast<long>(K));
        printf("  Output: [M0, M1, K] = [%ld, %ld, %ld]\n",
               static_cast<long>(M0), static_cast<long>(M1), static_cast<long>(K));
        // Round-trip check: top index [1,16,32] should map back to the
        // original [M, K] coordinates.
        auto top_idx = make_tuple(1, 16, 32);
        auto bottom_idx = adaptor.calculate_bottom_index(top_idx);
        printf("\nTest: [M0=1, M1=16, K=32] -> [M=%ld, K=%ld]\n",
               static_cast<long>(bottom_idx[number<0>{}]),
               static_cast<long>(bottom_idx[number<1>{}]));
    }
    printf("\n");
    // Example 1.2: Interleaved layout.
    printf("Example 1.2: GEMM C Matrix Tiling (Interleaved)\n");
    printf("------------------------------------------------\n");
    {
        constexpr index_t M = 256;
        constexpr index_t N = 256;
        constexpr index_t M0 = 4;
        constexpr index_t M1 = 64;
        constexpr index_t N0 = 4;
        constexpr index_t N1 = 64;
        printf("Input:  [M=%ld, N=%ld]\n", static_cast<long>(M), static_cast<long>(N));
        printf("Output: [M0=%ld, N0=%ld, M1=%ld, N1=%ld] (interleaved)\n",
               static_cast<long>(M0), static_cast<long>(N0),
               static_cast<long>(M1), static_cast<long>(N1));
        // Both M and N are split; the upper-dimension sequences interleave the
        // resulting halves instead of keeping them adjacent.
        auto transforms = make_tuple(
            make_unmerge_transform(make_tuple(number<M0>{}, number<M1>{})),
            make_unmerge_transform(make_tuple(number<N0>{}, number<N1>{}))
        );
        auto lower_dims = make_tuple(sequence<0>{}, sequence<1>{});
        auto upper_dims = make_tuple(
            sequence<0, 2>{}, // M splits to dims 0,2
            sequence<1, 3>{}  // N splits to dims 1,3
        );
        auto adaptor = make_single_stage_tensor_adaptor(
            transforms, lower_dims, upper_dims
        );
        auto top_idx = make_tuple(2, 3, 16, 32);
        auto bottom_idx = adaptor.calculate_bottom_index(top_idx);
        printf("\nTest: [M0=2, N0=3, M1=16, N1=32] -> [M=%ld, N=%ld]\n",
               static_cast<long>(bottom_idx[number<0>{}]),
               static_cast<long>(bottom_idx[number<1>{}]));
    }
    printf("\n\n");
}
// Part 2: transform_tensor_adaptor examples.
// Starts from a single-stage adaptor and layers additional transforms on top
// of it, producing a multi-stage mapping that is verified with
// calculate_bottom_index.
CK_TILE_DEVICE static void demonstrate_transform()
{
    printf("PART 2: transform_tensor_adaptor\n");
    printf("=================================\n\n");
    printf("Purpose: Add new transformations to an existing tensor adaptor.\n\n");
    // Example 2.1: Two-stage transformation.
    printf("Example 2.1: Two-Stage Hierarchical Tiling\n");
    printf("-------------------------------------------\n");
    {
        constexpr index_t M = 256;
        constexpr index_t K = 128;
        constexpr index_t M0 = 4;
        constexpr index_t M1 = 64;
        constexpr index_t K0 = 4;
        constexpr index_t K1 = 32;
        printf("Stage 1: [M=%ld, K=%ld] -> [M0=%ld, M1=%ld, K=%ld]\n",
               static_cast<long>(M), static_cast<long>(K),
               static_cast<long>(M0), static_cast<long>(M1), static_cast<long>(K));
        // Stage 1: split M into [M0, M1], leave K untouched.
        auto stage1_adaptor = make_single_stage_tensor_adaptor(
            make_tuple(
                make_unmerge_transform(make_tuple(number<M0>{}, number<M1>{})),
                make_pass_through_transform(number<K>{})
            ),
            make_tuple(sequence<0>{}, sequence<1>{}),
            make_tuple(sequence<0, 1>{}, sequence<2>{})
        );
        printf("Stage 2: [M0=%ld, M1=%ld, K=%ld] -> [M0=%ld, M1=%ld, K0=%ld, K1=%ld]\n",
               static_cast<long>(M0), static_cast<long>(M1), static_cast<long>(K),
               static_cast<long>(M0), static_cast<long>(M1),
               static_cast<long>(K0), static_cast<long>(K1));
        // Stage 2: keep M0/M1 as-is and additionally split K into [K0, K1].
        // The lower/upper sequences here refer to stage 1's OUTPUT dimensions.
        auto final_adaptor = transform_tensor_adaptor(
            stage1_adaptor,
            make_tuple(
                make_pass_through_transform(number<M0>{}),
                make_pass_through_transform(number<M1>{}),
                make_unmerge_transform(make_tuple(number<K0>{}, number<K1>{}))
            ),
            make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}),
            make_tuple(sequence<0>{}, sequence<1>{}, sequence<2, 3>{})
        );
        // Round-trip check across both stages back to the original [M, K].
        auto top_idx = make_tuple(2, 32, 3, 16);
        auto bottom_idx = final_adaptor.calculate_bottom_index(top_idx);
        printf("\nTest: [M0=2, M1=32, K0=3, K1=16] -> [M=%ld, K=%ld]\n",
               static_cast<long>(bottom_idx[number<0>{}]),
               static_cast<long>(bottom_idx[number<1>{}]));
    }
    printf("\n\n");
}
// Part 3: chain_tensor_adaptors examples.
// Builds two independent single-stage adaptors (A and B) and composes them
// with chain_tensor_adaptors; the end-to-end mapping is equivalent to the
// two-stage transform_tensor_adaptor result in Part 2, built differently.
CK_TILE_DEVICE static void demonstrate_chain()
{
    printf("PART 3: chain_tensor_adaptors\n");
    printf("==============================\n\n");
    printf("Purpose: Chain two tensor adaptors sequentially.\n\n");
    printf("Example 3.1: Chain Two Adaptors\n");
    printf("--------------------------------\n");
    {
        constexpr index_t M = 128;
        constexpr index_t K = 64;
        constexpr index_t M0 = 4;
        constexpr index_t M1 = 32;
        constexpr index_t K0 = 4;
        constexpr index_t K1 = 16;
        printf("Adaptor A: [M=%ld, K=%ld] -> [M0=%ld, M1=%ld, K=%ld]\n",
               static_cast<long>(M), static_cast<long>(K),
               static_cast<long>(M0), static_cast<long>(M1), static_cast<long>(K));
        // Adaptor A: split M into [M0, M1], pass K through.
        auto adaptor_a = make_single_stage_tensor_adaptor(
            make_tuple(
                make_unmerge_transform(make_tuple(number<M0>{}, number<M1>{})),
                make_pass_through_transform(number<K>{})
            ),
            make_tuple(sequence<0>{}, sequence<1>{}),
            make_tuple(sequence<0, 1>{}, sequence<2>{})
        );
        printf("Adaptor B: [M0=%ld, M1=%ld, K=%ld] -> [M0=%ld, M1=%ld, K0=%ld, K1=%ld]\n",
               static_cast<long>(M0), static_cast<long>(M1), static_cast<long>(K),
               static_cast<long>(M0), static_cast<long>(M1),
               static_cast<long>(K0), static_cast<long>(K1));
        // Adaptor B: consumes A's output shape; passes M0/M1 through and
        // splits K into [K0, K1].
        auto adaptor_b = make_single_stage_tensor_adaptor(
            make_tuple(
                make_pass_through_transform(number<M0>{}),
                make_pass_through_transform(number<M1>{}),
                make_unmerge_transform(make_tuple(number<K0>{}, number<K1>{}))
            ),
            make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}),
            make_tuple(sequence<0>{}, sequence<1>{}, sequence<2, 3>{})
        );
        // Compose: A's bottom is the chained adaptor's bottom, B's top is its top.
        auto chained = chain_tensor_adaptors(adaptor_a, adaptor_b);
        printf("\nChained: [M=%ld, K=%ld] -> [M0=%ld, M1=%ld, K0=%ld, K1=%ld]\n",
               static_cast<long>(M), static_cast<long>(K),
               static_cast<long>(M0), static_cast<long>(M1),
               static_cast<long>(K0), static_cast<long>(K1));
        // Round-trip check through both chained adaptors back to [M, K].
        auto top_idx = make_tuple(2, 16, 3, 8);
        auto bottom_idx = chained.calculate_bottom_index(top_idx);
        printf("Test: [M0=2, M1=16, K0=3, K1=8] -> [M=%ld, K=%ld]\n",
               static_cast<long>(bottom_idx[number<0>{}]),
               static_cast<long>(bottom_idx[number<1>{}]));
    }
    printf("\n\n");
}
// Part 4: Real-world GEMM example
//
// Shows the tiling used for a GEMM C matrix: each of M and N is split
// into three levels (repeat block / wave / XDL lane) in a single stage,
// with the output dimensions interleaved as [M0, N0, M1, N1, M2, N2].
CK_TILE_DEVICE static void demonstrate_gemm_tiling()
{
    printf("PART 4: Real-World GEMM Tiling Example\n");
    printf("=======================================\n\n");

    // C-matrix extents and the wave/XDL hierarchy used to tile them.
    constexpr index_t M       = 256;
    constexpr index_t N       = 256;
    constexpr index_t MWaves  = 4;
    constexpr index_t NWaves  = 4;
    constexpr index_t MPerXDL = 16;
    constexpr index_t NPerXDL = 16;

    // Repeat-block counts along each axis.
    constexpr index_t M0 = M / (MWaves * MPerXDL);
    constexpr index_t N0 = N / (NWaves * NPerXDL);

    printf("GEMM C Matrix: [M=%ld, N=%ld]\n",
           static_cast<long>(M), static_cast<long>(N));
    printf("Tiling: [M0=%ld, N0=%ld, M1=%ld, N1=%ld, M2=%ld, N2=%ld]\n",
           static_cast<long>(M0), static_cast<long>(N0),
           static_cast<long>(MWaves), static_cast<long>(NWaves),
           static_cast<long>(MPerXDL), static_cast<long>(NPerXDL));

    // One unmerge per axis; the M levels land in output dims 0/2/4 and
    // the N levels in output dims 1/3/5.
    auto c_block_adaptor = make_single_stage_tensor_adaptor(
        make_tuple(
            make_unmerge_transform(make_tuple(number<M0>{}, number<MWaves>{}, number<MPerXDL>{})),
            make_unmerge_transform(make_tuple(number<N0>{}, number<NWaves>{}, number<NPerXDL>{}))),
        make_tuple(sequence<0>{}, sequence<1>{}),
        make_tuple(sequence<0, 2, 4>{}, sequence<1, 3, 5>{}));

    // Map one hierarchical (tiled) index back to the flat [M, N] index.
    auto hierarchical_idx = make_tuple(2, 3, 1, 2, 8, 12);
    auto linear_idx       = c_block_adaptor.calculate_bottom_index(hierarchical_idx);

    printf("\nTest: [M0=2, N0=3, M1=1, N1=2, M2=8, N2=12] -> [M=%ld, N=%ld]\n",
           static_cast<long>(linear_idx[number<0>{}]),
           static_cast<long>(linear_idx[number<1>{}]));
    printf("\n\n");
}
// Part 5: Padding Transform - Coordinate mapping demonstration
//
// Builds a right-padded 1D descriptor (10 real elements padded to a
// logical length of 16) and, for every logical index, prints the memory
// offset produced by the coordinate and the value read from p_data at
// that offset.
CK_TILE_DEVICE static void demonstrate_padding_transform(const DataType* p_data)
{
printf("PART 5: Padding Transform - Virtual Padding\n");
printf("============================================\n\n");
printf("Demonstrating padding transform with coordinate mapping.\n\n");
// Original size: 10 elements, pad to 16
constexpr index_t OrigSize = 10;
constexpr index_t PadRight = 6;
constexpr index_t TotalSize = OrigSize + PadRight;
printf("Original size: %ld elements\n", static_cast<long>(OrigSize));
printf("Padding: +%ld elements (right)\n", static_cast<long>(PadRight));
printf("Total size: %ld elements\n\n", static_cast<long>(TotalSize));
// Create padded descriptor
// right_pad extends the logical length to TotalSize without changing
// the underlying 10-element memory layout.
auto desc_padded = transform_tensor_descriptor(
make_naive_tensor_descriptor_packed(make_tuple(number<OrigSize>{})),
make_tuple(make_right_pad_transform(number<OrigSize>{}, number<PadRight>{})),
make_tuple(sequence<0>{}),
make_tuple(sequence<0>{})
);
printf("Coordinate mapping and memory reads:\n");
printf("------------------------------------\n\n");
printf("Real area (indices 0-9):\n");
// Real region: each logical index yields a coordinate whose offset is
// used to read the backing buffer directly.
for(index_t i = 0; i < OrigSize; i++) {
auto coord = make_tensor_coordinate(desc_padded, make_tuple(i));
index_t offset = coord.get_offset();
DataType val = p_data[offset];
printf(" Index %ld -> offset %ld -> value %.1f (real data)\n",
static_cast<long>(i), static_cast<long>(offset), static_cast<float>(val));
}
printf("\nPadded area (indices 10-15):\n");
// NOTE(review): the printed text claims padded offsets wrap (modulo);
// nothing in this function enforces that — the offsets come straight
// from the pad transform. Confirm against an actual run before relying
// on the "wraps around" wording.
for(index_t i = OrigSize; i < TotalSize; i++) {
auto coord = make_tensor_coordinate(desc_padded, make_tuple(i));
index_t offset = coord.get_offset();
DataType val = p_data[offset];
printf(" Index %ld -> offset %ld -> value %.1f (wraps around)\n",
static_cast<long>(i), static_cast<long>(offset), static_cast<float>(val));
}
printf("\nKey Observations:\n");
printf(" - Real area (0-9): Maps to offsets 0-9, returns actual data\n");
printf(" - Padded area (10-15): Offsets wrap (modulo), reads same data\n");
printf(" - Padding is virtual - no extra memory allocated\n");
printf(" - In production (pooling/conv), buffer_view with identity value returns 0\n");
printf(" - Common use: Pad irregular sizes to match tile boundaries\n\n");
}
// Part 6: Replicate Transform with comprehensive coordinate testing
//
// Builds a [Merged=64, Dim1=2] view over a 16-element buffer by adding a
// replicated (broadcast) dimension of length 8 and merging it with a data
// dimension, then prints the memory offset of every logical coordinate to
// show many coordinates aliasing the same 16 memory locations.
CK_TILE_DEVICE static void demonstrate_replicate_transform()
{
    // Fix: this banner previously said "PART 5", but this section is
    // Part 6 (Part 5 is the padding-transform demo above).
    printf("PART 6: Replicate Transform - Broadcasting Dimensions\n");
    printf("======================================================\n\n");
    printf("Demonstrating replicate transform with complete coordinate mapping.\n\n");
    // Start with flattened 1D tensor
    constexpr index_t Size = 16; // H*W = 2*8
    printf("Step 1: Create initial 1D descriptor [Size=%ld]\n", static_cast<long>(Size));
    auto desc = make_naive_tensor_descriptor_packed(
        make_tuple(number<Size>{})
    );
    printf(" Initial: [16] (flattened)\n\n");
    // Stage 1: Replicate + Unmerge
    printf("Step 2: Apply Replicate and Unmerge\n");
    printf(" Transform 0: Replicate (no input) -> [Rep0=8]\n");
    printf(" Transform 1: Unmerge [16] -> [Dim0=8, Dim1=2]\n");
    auto desc_stage1 = transform_tensor_descriptor(
        desc,
        make_tuple(
            make_replicate_transform(make_tuple(number<8>{})), // Broadcast to 8
            make_unmerge_transform(make_tuple(number<8>{}, number<2>{})) // Split 16 -> [8,2]
        ),
        make_tuple(sequence<>{}, sequence<0>{}), // Replicate has no input, Unmerge uses dim 0
        make_tuple(sequence<0>{}, sequence<1, 2>{}) // Rep0=dim0, Unmerge produces dims 1,2
    );
    printf("\n After Stage 1: [Rep0=8, Dim0=8, Dim1=2]\n");
    printf(" Total: 3 dimensions\n\n");
    // Stage 2: Merge Rep0 with Dim0 so the broadcast dimension is folded
    // into an ordinary-looking 64-long dimension.
    printf("Step 3: Merge [Rep0, Dim0] -> [Merged=64]\n");
    auto desc_final = transform_tensor_descriptor(
        desc_stage1,
        make_tuple(
            make_merge_transform(make_tuple(number<8>{}, number<8>{})), // Merge Rep0, Dim0
            make_pass_through_transform(number<2>{}) // Dim1 unchanged
        ),
        make_tuple(sequence<0, 1>{}, sequence<2>{}), // Merge dims 0,1; pass-through dim 2
        make_tuple(sequence<0>{}, sequence<1>{}) // Output: [Merged, Dim1]
    );
    printf("\n Final: [Merged=64, Dim1=2]\n\n");
    // Exhaustive coordinate test: visit every [Merged, Dim1] coordinate
    // and print its memory offset.
    printf("COORDINATE MAPPING TEST - ALL %ld coordinates:\n",
           static_cast<long>(64 * 2));
    printf("=======================================================\n");
    printf("Format: [Merged, Dim1] -> memory_offset\n\n");
    auto lengths_final = desc_final.get_lengths();
    index_t merged_len = lengths_final[number<0>{}];
    index_t dim1_len = lengths_final[number<1>{}];
    printf("Descriptor: [Merged=%ld, Dim1=%ld] = %ld total coordinates\n",
           static_cast<long>(merged_len), static_cast<long>(dim1_len),
           static_cast<long>(merged_len * dim1_len));
    printf("Memory: Only 16 locations (broadcasting effect!)\n\n");
    // Print ALL coordinates, four entries per output line, to expose the
    // broadcasting pattern.
    index_t count = 0;
    for(index_t merged = 0; merged < merged_len; merged++) {
        for(index_t dim1 = 0; dim1 < dim1_len; dim1++) {
            auto coord = make_tensor_coordinate(desc_final, make_tuple(merged, dim1));
            index_t offset = coord.get_offset();
            printf(" [%2ld, %ld] -> offset %2ld",
                   static_cast<long>(merged), static_cast<long>(dim1),
                   static_cast<long>(offset));
            // Add newline every 4 coordinates for readability
            count++;
            if(count % 4 == 0) {
                printf("\n");
            } else {
                printf(" | ");
            }
        }
    }
    if(count % 4 != 0) printf("\n");
    printf("\nKey Observations:\n");
    printf(" - Total coordinates: %ld (Merged=%ld × Dim1=%ld)\n",
           static_cast<long>(merged_len * dim1_len),
           static_cast<long>(merged_len), static_cast<long>(dim1_len));
    printf(" - Memory locations: 16 (original size)\n");
    printf(" - Broadcasting ratio: %ld:1 (each memory location accessed by %ld coordinates)\n",
           static_cast<long>((merged_len * dim1_len) / 16),
           static_cast<long>((merged_len * dim1_len) / 16));
    printf(" - Replicate dimension creates virtual coordinates without memory cost!\n\n");
}
// Kernel entry point: only thread 0 runs, executing every demonstration
// in sequence and printing a closing summary. p_data backs the padding
// demo (Part 5); the other parts need no device data.
CK_TILE_DEVICE void operator()(const DataType* p_data) const
{
if(get_thread_id() != 0) return;
printf("\n=== TENSOR ADAPTORS IN CK_TILE ===\n\n");
demonstrate_single_stage();
demonstrate_transform();
demonstrate_chain();
demonstrate_gemm_tiling();
demonstrate_padding_transform(p_data);
demonstrate_replicate_transform();
printf("=== KEY TAKEAWAYS ===\n\n");
printf("1. make_single_stage_tensor_adaptor:\n");
printf(" - Creates adaptor with transformations in one stage\n");
printf(" - Foundation for all tensor layout transformations\n\n");
printf("2. transform_tensor_adaptor:\n");
printf(" - Adds new transformations to existing adaptor\n");
printf(" - Enables incremental building of complex layouts\n\n");
printf("3. chain_tensor_adaptors:\n");
printf(" - Composes two (or more) adaptors sequentially\n");
printf(" - Enables modular transformation design\n\n");
printf("4. Replicate transform:\n");
printf(" - Broadcasts dimensions (creates from nothing)\n");
printf(" - Useful for repeating data across tiles\n\n");
printf("5. All transformations are zero-copy views!\n\n");
}
};
int main()
{
std::cout << "\n================================================\n";
std::cout << "Tutorial 02: Tensor Adaptors\n";
std::cout << "================================================\n\n";
int device_count;
hip_check_error(hipGetDeviceCount(&device_count));
if(device_count == 0) {
std::cerr << "No GPU devices found!\n";
return 1;
}
hip_check_error(hipSetDevice(0));
hipDeviceProp_t props;
hip_check_error(hipGetDeviceProperties(&props, 0));
std::cout << "Using GPU: " << props.name << "\n";
// Allocate data for padding example (16 elements, but only first 10 have real data)
constexpr index_t data_size = 16;
std::vector<float> h_data(data_size, 0.0f); // Initialize all to 0
std::iota(h_data.begin(), h_data.begin() + 10, 1.0f); // First 10: 1,2,3,...,10
std::cout << "\nTest data (first 10 real, last 6 padding zeros): ";
for(size_t i = 0; i < h_data.size(); i++) {
std::cout << h_data[i];
if(i < h_data.size() - 1) std::cout << " ";
}
std::cout << "\n";
DeviceMem d_data(data_size * sizeof(float));
d_data.ToDevice(h_data.data(), data_size * sizeof(float));
constexpr index_t block_size = TensorAdaptorsKernel<float>::kBlockSize;
stream_config stream;
std::cout << "\nLaunching kernel...\n";
std::cout << "=====================================\n";
launch_kernel(stream,
make_kernel<block_size>(
TensorAdaptorsKernel<float>{},
dim3(1),
dim3(block_size),
0,
static_cast<const float*>(d_data.GetDeviceBuffer())));
hip_check_error(hipDeviceSynchronize());
std::cout << "=====================================\n";
std::cout << "\n=== Tutorial Complete ===\n";
std::cout << "You now understand:\n";
std::cout << "- make_single_stage_tensor_adaptor for basic transformations\n";
std::cout << "- transform_tensor_adaptor for incremental building\n";
std::cout << "- chain_tensor_adaptors for composing transformations\n";
std::cout << "- Padding transform with actual get_vectorized_elements reads\n";
std::cout << "- Replicate transform with broadcasting\n";
std::cout << "- Real-world GEMM tiling patterns\n\n";
return 0;
}

View File

@@ -0,0 +1,21 @@
# Tutorial 03: Padding with Tile Windows
# Demonstrates buffer_view + identity, tile_window, and load_tile with padding

# Executable for the padding-with-tiles tutorial.
add_executable(aa_tutorial_03_padding padding_with_tiles.cpp)

# The tutorials include ck_tile headers relative to the example root.
target_include_directories(aa_tutorial_03_padding PRIVATE
    "${CMAKE_CURRENT_SOURCE_DIR}/../.."
)

# Debug-friendly flags for stepping through the tutorial. Gated on
# GCC/Clang-style drivers (hipcc is Clang-based) so the target still
# configures with compilers that reject these flag spellings.
target_compile_options(aa_tutorial_03_padding PRIVATE
    "$<$<CXX_COMPILER_ID:GNU,Clang,AppleClang>:-Wall;-O0;-g;--save-temps>"
)

# Message for build output
message(STATUS "Added Tutorial 03: Padding with Tile Windows - buffer_view + identity + load_tile pattern")

View File

@@ -0,0 +1,177 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
/*
* Tutorial 03: Padding with Tile Windows
*
* This tutorial demonstrates the proper way to use padding transforms with:
* 1. buffer_view with identity values
* 2. tile_window for tiled access
* 3. load_tile with automatic padding handling
*
* Key Learning: This is the pattern used in pooling and convolution kernels
* to handle out-of-bounds accesses gracefully.
*/
#include <iostream>
#include <vector>
#include <numeric>
#include "ck_tile/core.hpp"
#include "ck_tile/host.hpp"
using namespace ck_tile;
// Kernel that walks a right-padded 1D tensor tile-by-tile and prints every
// element read through get_vectorized_elements.
//
// Fixes relative to the previous revision:
//  - The printed notes/summary claimed an identity value of 0.0 was
//    configured and returned for padded reads, while the code uses the
//    simple make_tensor_view API with NO identity value (as its own intro
//    printf states). The text now matches the code.
//  - Removed dead commented-out code.
template<typename DataType>
struct PaddingTileKernel
{
    static constexpr index_t kBlockSize = 64;

    // p_data      : device pointer holding orig_size real elements
    // orig_size   : logical length of the real data
    // padded_size : padded logical length (>= orig_size)
    CK_TILE_DEVICE void operator()(const DataType* p_data,
                                   index_t orig_size,
                                   index_t padded_size) const
    {
        // Single-thread demo: only lane 0 prints.
        if(get_thread_id() != 0) return;
        printf("\n=== PADDING WITH TILE WINDOWS ===\n\n");
        printf("Original size: %ld\n", static_cast<long>(orig_size));
        printf("Padded size: %ld\n\n", static_cast<long>(padded_size));
        // Step 1: Create original descriptor (runtime sizes)
        auto desc_orig = make_naive_tensor_descriptor(
            make_tuple(orig_size),
            make_tuple(1)
        );
        // Step 2: Apply padding transform - extends the logical length to
        // padded_size without allocating extra memory.
        index_t pad_amount = padded_size - orig_size;
        auto desc_padded = transform_tensor_descriptor(
            desc_orig,
            make_tuple(make_right_pad_transform(orig_size, pad_amount)),
            make_tuple(sequence<0>{}),
            make_tuple(sequence<0>{})
        );
        // Step 3: Simple tensor_view over global memory. No identity value
        // is configured, so padded reads are NOT forced to 0.
        auto tensor_simple = make_tensor_view<address_space_enum::global>(
            p_data,
            desc_padded
        );
        printf("Created tensor_view (simple API, no identity value)\n");
        printf(" - Padded reads will wrap around to existing data\n\n");
        // Step 4: Read tiles using get_vectorized_elements
        constexpr index_t tile_size = 8;
        printf("Reading tiles of size %ld using get_vectorized_elements:\n\n",
               static_cast<long>(tile_size));
        // Load tiles covering the entire padded range
        index_t num_tiles = (padded_size + tile_size - 1) / tile_size;
        for(index_t tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
            printf("Tile %ld (indices %ld-%ld):\n",
                   static_cast<long>(tile_idx),
                   static_cast<long>(tile_idx * tile_size),
                   static_cast<long>(tile_idx * tile_size + tile_size - 1));
            printf(" Values: ");
            // static_for supplies compile-time indices for the reads.
            static_for<0, tile_size, 1>{}([&](auto i) {
                index_t global_idx = tile_idx * tile_size + i;
                auto coord = make_tensor_coordinate(desc_padded, make_tuple(global_idx));
                auto buffer = tensor_simple.template get_vectorized_elements<
                    thread_buffer<DataType, 1>>(coord, 0);
                DataType val = buffer[number<0>{}];
                printf("%.1f ", static_cast<float>(val));
            });
            printf("\n");
            // Flag tiles whose tail lies beyond the original data.
            index_t tile_start = tile_idx * tile_size;
            index_t tile_end = tile_start + tile_size;
            if(tile_end > orig_size) {
                printf(" Note: Elements %ld-%ld are padded (no identity value configured)\n",
                       static_cast<long>(orig_size - tile_start),
                       static_cast<long>(tile_size - 1));
            }
            printf("\n");
        }
        printf("Key Observations:\n");
        printf(" - Padding transform extends the logical size with no extra memory\n");
        printf(" - This simple tensor_view has no identity value configured\n");
        printf(" - Padded indices read beyond the original data\n");
        printf(" - Production pooling/conv kernels pair this with buffer_view +\n");
        printf("   identity value so out-of-bounds reads return 0\n\n");
    }
};
// Host driver for Tutorial 03: selects a GPU, builds a 10-float buffer
// (values 1..10), launches PaddingTileKernel on one workgroup with a
// padded logical size of 16, and prints a summary.
int main()
{
std::cout << "\n================================================\n";
std::cout << "Tutorial 03: Padding with Tile Windows\n";
std::cout << "================================================\n\n";
// Require at least one visible GPU and select device 0.
int device_count;
hip_check_error(hipGetDeviceCount(&device_count));
if(device_count == 0) {
std::cerr << "No GPU devices found!\n";
return 1;
}
hip_check_error(hipSetDevice(0));
hipDeviceProp_t props;
hip_check_error(hipGetDeviceProperties(&props, 0));
std::cout << "Using GPU: " << props.name << "\n";
// Create test data: 10 real elements
constexpr index_t orig_size = 10;
constexpr index_t padded_size = 16;
std::vector<float> h_data(orig_size);
std::iota(h_data.begin(), h_data.end(), 1.0f); // 1, 2, 3, ..., 10
std::cout << "\nTest data (" << orig_size << " elements): ";
for(auto val : h_data) {
std::cout << val << " ";
}
std::cout << "\n";
std::cout << "Will be padded to " << padded_size << " elements\n";
// NOTE(review): only orig_size elements are allocated on the device,
// while the kernel walks logical indices up to padded_size; the padded
// tail is not backed by this buffer — confirm the intended behavior of
// reads past the real data in the padding demo.
DeviceMem d_data(orig_size * sizeof(float));
d_data.ToDevice(h_data.data(), orig_size * sizeof(float));
constexpr index_t block_size = PaddingTileKernel<float>::kBlockSize;
stream_config stream;
std::cout << "\nLaunching kernel...\n";
std::cout << "=====================================\n";
// Single workgroup; kernel arguments follow the 0-byte dynamic
// shared-memory size in make_kernel.
launch_kernel(stream,
make_kernel<block_size>(
PaddingTileKernel<float>{},
dim3(1),
dim3(block_size),
0,
static_cast<const float*>(d_data.GetDeviceBuffer()),
orig_size,
padded_size));
hip_check_error(hipDeviceSynchronize());
std::cout << "=====================================\n";
std::cout << "\n=== Tutorial Complete ===\n";
std::cout << "You now understand:\n";
std::cout << "- buffer_view with identity values for padding\n";
std::cout << "- tile_window for tiled access patterns\n";
std::cout << "- load_tile automatically handling out-of-bounds\n";
std::cout << "- The pattern used in pooling/convolution kernels\n\n";
return 0;
}

View File

@@ -0,0 +1,22 @@
# Tutorial 04: Tensor Descriptor vs Tensor Adaptor
# Demonstrates the differences between tensor_adaptor and tensor_descriptor,
# including coordinate operations and when to use each

# Executable for the descriptor-vs-adaptor tutorial.
add_executable(aa_tutorial_04_descriptor_vs_adaptor descriptor_vs_adaptor.cpp)

# The tutorials include ck_tile headers relative to the example root.
target_include_directories(aa_tutorial_04_descriptor_vs_adaptor PRIVATE
    "${CMAKE_CURRENT_SOURCE_DIR}/../.."
)

# Debug-friendly flags for stepping through the tutorial. Gated on
# GCC/Clang-style drivers (hipcc is Clang-based) so the target still
# configures with compilers that reject these flag spellings.
target_compile_options(aa_tutorial_04_descriptor_vs_adaptor PRIVATE
    "$<$<CXX_COMPILER_ID:GNU,Clang,AppleClang>:-Wall;-O0;-g;--save-temps>"
)

# Message for build output
message(STATUS "Added Tutorial 04: Descriptor vs Adaptor - Understanding tensor_adaptor and tensor_descriptor")

View File

@@ -0,0 +1,440 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
/*
* Tutorial 04: Tensor Descriptor vs Tensor Adaptor
*
* This tutorial demonstrates the key differences between tensor_adaptor and tensor_descriptor:
* 1. tensor_adaptor - Pure coordinate transformation logic
* 2. tensor_descriptor - Complete tensor specification with memory info
* 3. Coordinate operations - Creating and moving coordinates efficiently
* 4. Practical examples showing when to use each
*
* Key Learning: Understanding the relationship and use cases for adaptors vs descriptors
*/
#include <iostream>
#include <vector>
#include <iomanip>
#include "ck_tile/core.hpp"
#include "ck_tile/host.hpp"
using namespace ck_tile;
// Device-side tutorial kernel: contrasts tensor_adaptor (pure coordinate
// transformation) with tensor_descriptor (transformation + memory info),
// and demonstrates coordinate creation/movement. All methods print only.
template<typename DataType>
struct DescriptorVsAdaptorKernel
{
// One workgroup of this size is launched; only thread 0 prints.
static constexpr index_t kBlockSize = 64;
// Part 1: Tensor Adaptor - Pure Transformation Logic
//
// Builds adaptors that map tiled indices back to flat indices; shows that
// an adaptor alone cannot produce memory offsets.
CK_TILE_DEVICE static void demonstrate_tensor_adaptor()
{
printf("PART 1: tensor_adaptor - Pure Coordinate Transformation\n");
printf("========================================================\n\n");
printf("Purpose: Define HOW to transform coordinates without memory information.\n\n");
// Example 1.1: Simple tiling transformation
printf("Example 1.1: Matrix Tiling [M, K] -> [M0, M1, K]\n");
printf("------------------------------------------------\n");
{
constexpr index_t M = 128;
constexpr index_t K = 64;
constexpr index_t M0 = 4;
constexpr index_t M1 = 32;
printf("Input: [M=%ld, K=%ld]\n", static_cast<long>(M), static_cast<long>(K));
printf("Output: [M0=%ld, M1=%ld, K=%ld]\n",
static_cast<long>(M0), static_cast<long>(M1), static_cast<long>(K));
// Create adaptor - only transformation logic
auto adaptor = make_single_stage_tensor_adaptor(
make_tuple(
make_unmerge_transform(make_tuple(number<M0>{}, number<M1>{})),
make_pass_through_transform(number<K>{})
),
make_tuple(sequence<0>{}, sequence<1>{}),
make_tuple(sequence<0, 1>{}, sequence<2>{})
);
printf("\nAdaptor properties:\n");
printf(" - Stores: Transformation logic only\n");
printf(" - Does NOT store: Memory size, vectorization info\n");
printf(" - Can do: Map coordinates [M0, M1, K] -> [M, K]\n");
printf(" - Cannot do: Calculate memory offsets\n\n");
// Test coordinate mapping
auto top_idx = make_tuple(2, 16, 32);
auto bottom_idx = adaptor.calculate_bottom_index(top_idx);
printf("Coordinate mapping test:\n");
printf(" Input: [M0=%ld, M1=%ld, K=%ld]\n",
static_cast<long>(top_idx.template get<0>()),
static_cast<long>(top_idx.template get<1>()),
static_cast<long>(top_idx.template get<2>()));
printf(" Output: [M=%ld, K=%ld]\n",
static_cast<long>(bottom_idx[number<0>{}]),
static_cast<long>(bottom_idx[number<1>{}]));
// NOTE(review): the printed formula reuses dimension names for index
// values; numerically it is m0_idx*M1 + m1_idx = 2*32 + 16.
printf(" Calculation: M = M0*M1 + M1 = 2*32 + 16 = %ld\n",
static_cast<long>(bottom_idx[number<0>{}]));
}
printf("\n");
// Example 1.2: Reusable transformation pattern
printf("Example 1.2: Reusable Transformation Pattern\n");
printf("---------------------------------------------\n");
{
printf("Adaptors are reusable - same transformation for different sizes!\n\n");
// Define a generic 2D tiling pattern as a lambda parameterized by the
// compile-time tile extents.
auto create_tiling_adaptor = [](auto M0, auto M1, auto N0, auto N1) {
return make_single_stage_tensor_adaptor(
make_tuple(
make_unmerge_transform(make_tuple(M0, M1)),
make_unmerge_transform(make_tuple(N0, N1))
),
make_tuple(sequence<0>{}, sequence<1>{}),
make_tuple(sequence<0, 1>{}, sequence<2, 3>{})
);
};
// Use same pattern for different matrix sizes
[[maybe_unused]] auto adaptor_64x64 = create_tiling_adaptor(
number<4>{}, number<16>{}, number<4>{}, number<16>{}
);
[[maybe_unused]] auto adaptor_128x128 = create_tiling_adaptor(
number<8>{}, number<16>{}, number<8>{}, number<16>{}
);
printf("Created two adaptors with same pattern:\n");
printf(" - 64x64 matrix: [64, 64] -> [4, 16, 4, 16]\n");
printf(" - 128x128 matrix: [128, 128] -> [8, 16, 8, 16]\n");
printf("\nBoth use identical transformation logic!\n");
}
printf("\n\n");
}
// Part 2: Tensor Descriptor - Complete Specification
//
// Shows that a descriptor adds element_space_size and offset calculation
// on top of the adaptor's coordinate mapping.
CK_TILE_DEVICE static void demonstrate_tensor_descriptor()
{
printf("PART 2: tensor_descriptor - Complete Tensor Specification\n");
printf("==========================================================\n\n");
printf("Purpose: Complete tensor with transformation + memory + vectorization info.\n\n");
// Example 2.1: Creating a descriptor
printf("Example 2.1: Creating a Descriptor\n");
printf("-----------------------------------\n");
{
constexpr index_t M = 128;
constexpr index_t K = 64;
// Create descriptor - includes memory information
auto desc = make_naive_tensor_descriptor_packed(
make_tuple(number<M>{}, number<K>{})
);
printf("Created descriptor for [M=%ld, K=%ld] matrix\n",
static_cast<long>(M), static_cast<long>(K));
auto space_size = desc.get_element_space_size();
printf("\nDescriptor properties:\n");
printf(" - Stores: Transformation logic + memory info\n");
printf(" - element_space_size: %ld elements\n", static_cast<long>(space_size));
printf(" - Can calculate: Actual memory offsets\n");
printf(" - Includes: Vectorization guarantees\n\n");
// Calculate memory offsets for first, middle, and last elements.
auto offset1 = desc.calculate_offset(make_tuple(10, 20));
auto offset2 = desc.calculate_offset(make_tuple(0, 0));
auto offset3 = desc.calculate_offset(make_tuple(M-1, K-1));
printf("Memory offset calculations:\n");
printf(" [10, 20] -> offset %ld (10*64 + 20)\n", static_cast<long>(offset1));
printf(" [0, 0] -> offset %ld (first element)\n", static_cast<long>(offset2));
printf(" [%ld, %ld] -> offset %ld (last element)\n",
static_cast<long>(M-1), static_cast<long>(K-1), static_cast<long>(offset3));
}
printf("\n");
// Example 2.2: Transforming a Descriptor
printf("Example 2.2: Transforming a Descriptor\n");
printf("---------------------------------------\n");
{
constexpr index_t M = 256;
constexpr index_t K = 128;
constexpr index_t M0 = 4;
constexpr index_t M1 = 64;
printf("Step 1: Create initial descriptor\n");
auto desc_initial = make_naive_tensor_descriptor_packed(
make_tuple(number<M>{}, number<K>{})
);
printf(" Initial: [M=%ld, K=%ld]\n", static_cast<long>(M), static_cast<long>(K));
printf(" Memory size: %ld elements\n\n",
static_cast<long>(desc_initial.get_element_space_size()));
printf("Step 2: Transform to add tiling\n");
// Same unmerge/pass-through pattern as the adaptor example, but the
// result remains a descriptor so offsets stay computable.
auto desc_tiled = transform_tensor_descriptor(
desc_initial,
make_tuple(
make_unmerge_transform(make_tuple(number<M0>{}, number<M1>{})),
make_pass_through_transform(number<K>{})
),
make_tuple(sequence<0>{}, sequence<1>{}),
make_tuple(sequence<0, 1>{}, sequence<2>{})
);
printf(" Transformed: [M, K] -> [M0=%ld, M1=%ld, K=%ld]\n",
static_cast<long>(M0), static_cast<long>(M1), static_cast<long>(K));
printf(" Memory size preserved: %ld elements\n\n",
static_cast<long>(desc_tiled.get_element_space_size()));
printf("Now we can calculate actual memory offsets!\n");
auto offset = desc_tiled.calculate_offset(make_tuple(2, 16, 32));
printf(" [M0=2, M1=16, K=32] -> offset %ld\n", static_cast<long>(offset));
}
printf("\n\n");
}
// Part 3: Coordinate Operations
//
// Demonstrates make_tensor_coordinate and move_tensor_coordinate, on both
// a plain packed descriptor and a tiled one.
CK_TILE_DEVICE static void demonstrate_coordinate_operations()
{
printf("PART 3: Coordinate Operations - Creating and Moving\n");
printf("====================================================\n\n");
printf("Purpose: Efficiently track and update positions in tensor space.\n\n");
// Example 3.1: Creating coordinates
printf("Example 3.1: Creating Coordinates\n");
printf("----------------------------------\n");
{
constexpr index_t M = 64;
constexpr index_t K = 32;
auto desc = make_naive_tensor_descriptor_packed(
make_tuple(number<M>{}, number<K>{})
);
printf("Descriptor: [M=%ld, K=%ld]\n\n", static_cast<long>(M), static_cast<long>(K));
// Create coordinate at position [10, 20]
auto coord = make_tensor_coordinate(desc, make_tuple(10, 20));
printf("Created coordinate at [10, 20]:\n");
printf(" - Top index (user view): [%ld, %ld]\n",
static_cast<long>(coord.get_index()[number<0>{}]),
static_cast<long>(coord.get_index()[number<1>{}]));
printf(" - Memory offset: %ld\n", static_cast<long>(coord.get_offset()));
printf(" - Calculation: 10*32 + 20 = %ld\n", static_cast<long>(coord.get_offset()));
}
printf("\n");
// Example 3.2: Moving coordinates (efficient iteration)
printf("Example 3.2: Moving Coordinates - Efficient Iteration\n");
printf("------------------------------------------------------\n");
{
constexpr index_t M = 64;
constexpr index_t K = 32;
auto desc = make_naive_tensor_descriptor_packed(
make_tuple(number<M>{}, number<K>{})
);
printf("Scenario: Iterate through a row of tiles\n");
printf("Descriptor: [M=%ld, K=%ld]\n\n", static_cast<long>(M), static_cast<long>(K));
// Start at [0, 0]
auto coord = make_tensor_coordinate(desc, make_tuple(0, 0));
printf("Initial position [0, 0]:\n");
printf(" Offset: %ld\n\n", static_cast<long>(coord.get_offset()));
// Move to [0, 8] - move by 8 in K dimension
printf("Move by [0, 8] (8 columns to the right):\n");
move_tensor_coordinate(desc, coord, make_tuple(0, 8));
printf(" New position: [%ld, %ld]\n",
static_cast<long>(coord.get_index()[number<0>{}]),
static_cast<long>(coord.get_index()[number<1>{}]));
printf(" New offset: %ld\n", static_cast<long>(coord.get_offset()));
printf(" (Much faster than creating new coordinate!)\n\n");
// Move to [1, 8] - move by 1 in M dimension
printf("Move by [1, 0] (1 row down):\n");
move_tensor_coordinate(desc, coord, make_tuple(1, 0));
printf(" New position: [%ld, %ld]\n",
static_cast<long>(coord.get_index()[number<0>{}]),
static_cast<long>(coord.get_index()[number<1>{}]));
printf(" New offset: %ld\n", static_cast<long>(coord.get_offset()));
printf(" Calculation: 1*32 + 8 = %ld\n\n", static_cast<long>(coord.get_offset()));
printf("Why use move_tensor_coordinate?\n");
printf(" ✓ Incremental update - only recalculates what changed\n");
printf(" ✓ Skips unnecessary transformations (optimization)\n");
printf(" ✓ Essential for tile window iteration patterns\n");
printf(" ✓ Much faster than creating new coordinates\n");
}
printf("\n");
// Example 3.3: Moving with complex transformations
printf("Example 3.3: Moving Coordinates with Transformations\n");
printf("----------------------------------------------------\n");
{
constexpr index_t M = 128;
constexpr index_t K = 64;
constexpr index_t M0 = 4;
constexpr index_t M1 = 32;
// Create tiled descriptor
auto desc = make_naive_tensor_descriptor_packed(
make_tuple(number<M>{}, number<K>{})
);
auto desc_tiled = transform_tensor_descriptor(
desc,
make_tuple(
make_unmerge_transform(make_tuple(number<M0>{}, number<M1>{})),
make_pass_through_transform(number<K>{})
),
make_tuple(sequence<0>{}, sequence<1>{}),
make_tuple(sequence<0, 1>{}, sequence<2>{})
);
printf("Tiled descriptor: [M, K] -> [M0=%ld, M1=%ld, K=%ld]\n\n",
static_cast<long>(M0), static_cast<long>(M1), static_cast<long>(K));
// Create coordinate
auto coord = make_tensor_coordinate(desc_tiled, make_tuple(1, 8, 16));
printf("Initial: [M0=1, M1=8, K=16]\n");
printf(" Offset: %ld\n\n", static_cast<long>(coord.get_offset()));
// Move to next tile in M1 dimension
move_tensor_coordinate(desc_tiled, coord, make_tuple(0, 4, 0));
printf("After move [0, 4, 0]:\n");
printf(" Position: [M0=%ld, M1=%ld, K=%ld]\n",
static_cast<long>(coord.get_index()[number<0>{}]),
static_cast<long>(coord.get_index()[number<1>{}]),
static_cast<long>(coord.get_index()[number<2>{}]));
printf(" Offset: %ld\n", static_cast<long>(coord.get_offset()));
printf("\nThe move operation efficiently propagates through transformations!\n");
}
printf("\n\n");
}
// Part 4: When to Use Which
//
// Print-only guidance; no tensor API calls here.
CK_TILE_DEVICE static void demonstrate_use_cases()
{
printf("PART 4: When to Use Adaptor vs Descriptor\n");
printf("==========================================\n\n");
printf("Use tensor_adaptor when:\n");
printf(" ✓ Designing reusable transformation patterns\n");
printf(" ✓ Building intermediate transformation stages\n");
printf(" ✓ Composing transformations with chain_tensor_adaptors()\n");
printf(" ✓ Memory size is not relevant\n");
printf(" ✓ Only need coordinate mapping logic\n\n");
printf("Use tensor_descriptor when:\n");
printf(" ✓ Working with actual tensors that need memory\n");
printf(" ✓ Calculating actual memory offsets\n");
printf(" ✓ Need to know total memory footprint\n");
printf(" ✓ Require vectorization guarantees\n");
printf(" ✓ Creating tensor_view for data access\n");
printf(" ✓ Working with physical memory buffers\n\n");
printf("Relationship:\n");
printf(" tensor_descriptor IS-A tensor_adaptor (inheritance)\n");
printf(" descriptor = adaptor + memory info + vectorization info\n\n");
printf("Think of it as:\n");
printf(" adaptor = \"The recipe\" (how to transform)\n");
printf(" descriptor = \"Recipe + ingredients + kitchen\" (complete spec)\n\n");
}
// Kernel entry point: thread 0 runs all four demonstrations in order and
// prints a closing summary.
CK_TILE_DEVICE void operator()() const
{
if(get_thread_id() != 0) return;
printf("\n=== TENSOR DESCRIPTOR VS TENSOR ADAPTOR ===\n\n");
demonstrate_tensor_adaptor();
demonstrate_tensor_descriptor();
demonstrate_coordinate_operations();
demonstrate_use_cases();
printf("=== KEY TAKEAWAYS ===\n\n");
printf("1. tensor_adaptor:\n");
printf(" - Pure coordinate transformation logic\n");
printf(" - Lightweight, reusable patterns\n");
printf(" - No memory or vectorization info\n\n");
printf("2. tensor_descriptor:\n");
printf(" - Complete tensor specification\n");
printf(" - Inherits from tensor_adaptor\n");
printf(" - Adds memory size and vectorization guarantees\n");
printf(" - Can calculate actual memory offsets\n\n");
printf("3. Coordinate operations:\n");
printf(" - make_tensor_coordinate() creates position tracker\n");
printf(" - move_tensor_coordinate() efficiently updates position\n");
printf(" - Essential for tile window iteration\n\n");
printf("4. Use the right tool:\n");
printf(" - Adaptor for transformation patterns\n");
printf(" - Descriptor for actual tensors with memory\n\n");
}
};
int main()
{
std::cout << "\n================================================\n";
std::cout << "Tutorial 04: Tensor Descriptor vs Tensor Adaptor\n";
std::cout << "================================================\n\n";
int device_count;
hip_check_error(hipGetDeviceCount(&device_count));
if(device_count == 0) {
std::cerr << "No GPU devices found!\n";
return 1;
}
hip_check_error(hipSetDevice(0));
hipDeviceProp_t props;
hip_check_error(hipGetDeviceProperties(&props, 0));
std::cout << "Using GPU: " << props.name << "\n";
constexpr index_t block_size = DescriptorVsAdaptorKernel<float>::kBlockSize;
stream_config stream;
std::cout << "\nLaunching kernel...\n";
std::cout << "=====================================\n";
launch_kernel(stream,
make_kernel<block_size>(
DescriptorVsAdaptorKernel<float>{},
dim3(1),
dim3(block_size),
0));
hip_check_error(hipDeviceSynchronize());
std::cout << "=====================================\n";
std::cout << "\n=== Tutorial Complete ===\n";
std::cout << "You now understand:\n";
std::cout << "- The difference between tensor_adaptor and tensor_descriptor\n";
std::cout << "- When to use each abstraction\n";
std::cout << "- How to create and move coordinates efficiently\n";
std::cout << "- The inheritance relationship between them\n";
std::cout << "- Practical examples of both in action\n\n";
std::cout << "See DESCRIPTOR_VS_ADAPTOR.md for detailed documentation.\n\n";
return 0;
}

View File

@@ -0,0 +1,23 @@
# Tutorial 05: Basic Distributed GEMM
# Demonstrates basic distributed GEMM with tile distributions
# Single warp per 16x16 output block

# Create executable for basic distributed GEMM tutorial
add_executable(aa_tutorial_05_basic_distributed_gemm basic_distributed_gemm.cpp)

# Make the ck_tile headers (two directories up) visible to this tutorial only.
target_include_directories(aa_tutorial_05_basic_distributed_gemm PRIVATE
    ${CMAKE_CURRENT_SOURCE_DIR}/../..
)

# Debug build flags, previously left as commented-out code. Guarded behind a
# cache option so they can be enabled with -DCK_TILE_TUTORIAL_DEBUG_FLAGS=ON
# instead of editing this file.
option(CK_TILE_TUTORIAL_DEBUG_FLAGS
       "Build ck_tile tutorials with -Wall -O0 -g --save-temps" OFF)
if(CK_TILE_TUTORIAL_DEBUG_FLAGS)
    target_compile_options(aa_tutorial_05_basic_distributed_gemm PRIVATE
        -Wall
        -O0
        -g
        --save-temps
    )
endif()

# Message for build output
message(STATUS "Added Tutorial 05: Basic Distributed GEMM - Understanding tile distributions for GEMM")

View File

@@ -0,0 +1,457 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
/*
* Stage 10b: Distributed HGEMM - WORK IN PROGRESS
*
* This is saved work in progress. The structure is good but tile distributions
* need to be fixed to properly match the MLSE example patterns.
*
* TODO: Fix tile_distribution_encoding for A and B
* - Y dimensions should map to vector position
* - Need 2x fp16 per register (32 bits)
* - P0 = threads, P1 = warps typically
*/
#include <iostream>
#include <vector>
#include <iomanip>
#include <chrono>
#include <limits>
#include "ck_tile/core.hpp"
#include "ck_tile/host.hpp"
#include "ck_tile/ops/common.hpp"
#include "ck_tile/ops/gemm/warp/warp_gemm.hpp"
using namespace ck_tile;
// Distributed HGEMM kernel using proper tile_distribution
// One-wave-per-tile HGEMM kernel: each 64-lane wave computes a single
// 16x16 output block of D = alpha * A * B + beta * C using MFMA via
// ck_tile's WarpGemm. A is column-major, B is row-major, C/D column-major.
// NOTE(review): file header marks this as work-in-progress; the tile
// distribution encodings below are the part still being validated.
template<typename ADataType, typename BDataType, typename CDataType, typename AccDataType>
struct DistributedHgemmKernel
{
    static constexpr index_t kWaveSize = 64; // AMD wave size
    static constexpr index_t kBlockM = 16; // MFMA M dimension
    static constexpr index_t kBlockN = 16; // MFMA N dimension
    static constexpr index_t kBlockK = 16; // MFMA K dimension per instruction

    // Use ck_tile's WarpGemm for MFMA
    using WarpGemm = WarpGemmMfmaF16F16F32M16N16K16;

    // One wave per block: block size equals wave size.
    static constexpr index_t kBlockSize = kWaveSize;

    // Computes D = alpha * A * B + beta * C for the 16x16 tile owned by
    // this wave. lda/ldb/ldc/ldd are the leading dimensions of each matrix.
    CK_TILE_DEVICE void operator()(const ADataType* a,
                                   const BDataType* b,
                                   const CDataType* c,
                                   CDataType* d,
                                   index_t M,
                                   index_t N,
                                   index_t K,
                                   index_t lda, // Leading dimension of A (column-major)
                                   index_t ldb, // Leading dimension of B (row-major)
                                   index_t ldc, // Leading dimension of C (column-major)
                                   index_t ldd, // Leading dimension of D (column-major)
                                   AccDataType alpha,
                                   AccDataType beta) const
    {
        // Calculate which 16×16 block this wave computes.
        // Waves are laid out row-major over the (M/16) x (N/16) output grid.
        // const index_t wave_id = get_block_id() * get_block_size() / kWaveSize + threadIdx.x / kWaveSize;
        const index_t wave_id  = get_warp_id();
        const index_t wave_m   = wave_id / (N / kBlockN);
        const index_t wave_n   = wave_id % (N / kBlockN);
        const index_t m_offset = wave_m * kBlockM;
        const index_t n_offset = wave_n * kBlockN;

        // Bounds check
        if(m_offset >= M || n_offset >= N)
            return;

        // Only threads in the first wave of each block do work (simplified)
        if(threadIdx.x >= kWaveSize)
            return;

        // Create tensor views for matrices.
        // A is column-major: M×K with stride lda between columns
        const auto a_tensor = make_naive_tensor_view<address_space_enum::global>(
            a,
            make_tuple(M, K),   // Shape: M×K
            make_tuple(1, lda), // Strides: column-major
            number<1>{},
            number<1>{}
        );
        // B is row-major: K×N with stride ldb between rows
        // NOTE(review): B's view advertises vector length 4 while A and C
        // use 1 — confirm this asymmetry is intentional for the WIP code.
        const auto b_tensor = make_naive_tensor_view<address_space_enum::global>(
            b,
            make_tuple(K, N),   // Shape: K×N
            make_tuple(ldb, 1), // Strides: row-major
            number<4>{},
            number<1>{}
        );
        // C is column-major: M×N with stride ldc between columns
        const auto c_tensor = make_naive_tensor_view<address_space_enum::global>(
            c,
            make_tuple(M, N),   // Shape: M×N
            make_tuple(1, ldc), // Strides: column-major
            number<1>{},
            number<1>{}
        );
        // D is column-major: M×N with stride ldd between columns
        auto d_tensor = make_naive_tensor_view<address_space_enum::global>(
            d,
            make_tuple(M, N),   // Shape: M×N
            make_tuple(1, ldd), // Strides: column-major
            number<1>{},
            number<1>{}
        );

        // Use our tested custom distributions from test_a_distribution.cpp and test_b_distribution.cpp
        // A: Column-major M×K with each thread loading 4 consecutive K values from one M position
        constexpr auto a_distribution = make_static_tile_distribution(
            tile_distribution_encoding<
                sequence<>,              // No replication
                tuple<sequence<16>,      // H0 (M): 16 lanes for M
                      sequence<4, 4>>,   // H1 (K): 4 lanes × 4 per lane
                tuple<sequence<2, 1>>,   // P-dims map to H-dims
                tuple<sequence<0, 0>>,   // P positions in H-dims
                sequence<2>,             // Y maps to K dimension only
                sequence<1>>{}           // Y at position 1
        );
        // B: Row-major K×N with each thread loading 4 consecutive K values from one N position
        constexpr auto b_distribution = make_static_tile_distribution(
            tile_distribution_encoding<
                sequence<>,              // No replication
                tuple<sequence<4, 4>,    // H0 (K): 4 groups of 4 consecutive K values
                      sequence<16>>,     // H1 (N): 16 N positions
                tuple<sequence<1, 2>>,   // P-dims map to H-dims (P0->H1, P1->H0)
                tuple<sequence<0, 0>>,   // P positions in H-dims
                sequence<1>,             // Y maps to K dimension (H0)
                sequence<1>>{}           // Y at position 1 in H0 (the second 4 in sequence<4,4>)
        );

        // Create windows for A and B that we'll move along K.
        auto a_window = make_tile_window(
            a_tensor,
            make_tuple(number<kBlockM>{}, number<kBlockK>{}),
            {m_offset, 0},
            a_distribution
        );
        auto b_window = make_tile_window(
            b_tensor,
            make_tuple(number<kBlockK>{}, number<kBlockN>{}),
            {0, n_offset},
            b_distribution
        );

        // C distribution (column-major M×N output) - tested in test_c_distribution.cpp
        constexpr auto c_distribution = make_static_tile_distribution(
            tile_distribution_encoding<
                sequence<>,              // No replication
                tuple<sequence<4, 4>,    // H0 (M): 4 groups of 4 consecutive M values
                      sequence<16>>,     // H1 (N): 16 N positions
                tuple<sequence<1, 2>>,   // P-dims map to H-dims (P0->H1, P1->H0)
                tuple<sequence<0, 0>>,   // P positions in H-dims
                sequence<1>,             // Y maps to M dimension (H0)
                sequence<1>>{}           // Y at position 1 in H0 (the second 4 in sequence<4,4>)
        );

        // Create accumulator using our tested C distribution.
        auto acc_tile = make_static_distributed_tensor<AccDataType>(c_distribution);
        // Initialize accumulator to zero using set_tile
        set_tile(acc_tile, AccDataType{0});

        // Main K-loop with MFMA accumulation. Assumes K is a multiple of
        // kBlockK (the host driver uses multiples of 16).
        const index_t num_k_loops = K / kBlockK;
        for(index_t k_iter = 0; k_iter < num_k_loops; ++k_iter)
        {
            // Load tiles
            const auto a_tile = load_tile(a_window);
            const auto b_tile = load_tile(b_window);
            // Use WarpGemm to perform MFMA
            // This properly calls the MFMA instruction with the right distributions
            WarpGemm{}(acc_tile, a_tile, b_tile);
            // Move windows to next K chunk using the move API
            // This efficiently updates window_origin_ without recreating the window
            if(k_iter < num_k_loops - 1) {
                a_window.move({0, kBlockK}); // Move K forward for A
                b_window.move({kBlockK, 0}); // Move K forward for B
            }
        }

        // Scale by alpha using ck_tile's elementwise API
        // This is more idiomatic than manual buffer manipulation
        tile_elementwise_inout([alpha](auto& acc_val) { acc_val *= alpha; }, acc_tile);

        // Load C, apply beta, and add to result. Skipped entirely when beta
        // is (numerically) zero, which avoids a redundant global load.
        if(std::abs(beta) > 1e-6f)
        {
            auto c_window = make_tile_window(
                c_tensor,
                make_tuple(number<kBlockM>{}, number<kBlockN>{}),
                {m_offset, n_offset},
                c_distribution
            );
            const auto c_tile = load_tile(c_window);
            // Apply beta * C + acc using ck_tile's elementwise API
            // This combines two tiles with a lambda function
            tile_elementwise_inout(
                [beta](const auto& c_val, auto& acc_val) {
                    acc_val += beta * c_val;
                },
                c_tile, acc_tile);
        }

        // Store final result to D (same layout/distribution as C).
        auto d_window = make_tile_window(
            d_tensor,
            make_tuple(number<kBlockM>{}, number<kBlockN>{}),
            {m_offset, n_offset},
            c_distribution
        );
        store_tile(d_window, acc_tile);
    }
};
// CPU reference for verification
template<typename InType, typename AccType>
void reference_gemm_mixed(const std::vector<InType>& a, // Column-major
const std::vector<InType>& b, // Row-major
const std::vector<AccType>& c, // Column-major
std::vector<AccType>& d, // Column-major
index_t M, index_t N, index_t K,
index_t lda, index_t ldb, index_t ldc, index_t ldd,
AccType alpha, AccType beta)
{
// D = alpha * A * B + beta * C
for(index_t n = 0; n < N; ++n) {
for(index_t m = 0; m < M; ++m) {
AccType sum = 0;
// Compute A * B
for(index_t k = 0; k < K; ++k) {
// A is column-major: A[m,k] = a[m + k*lda]
// B is row-major: B[k,n] = b[k*ldb + n]
sum += static_cast<AccType>(a[m + k * lda]) *
static_cast<AccType>(b[k * ldb + n]);
}
// D[m,n] = alpha * sum + beta * C[m,n]
// Both C and D are column-major
d[m + n * ldd] = alpha * sum + beta * c[m + n * ldc];
}
}
}
// Helper to fill matrix with random values
// Fill `data` with pseudo-random values in [min_val, max_val].
// Uses rand(), so call srand() first for a reproducible sequence.
template<typename T>
void fill_random(std::vector<T>& data, T min_val = -1, T max_val = 1)
{
    for(std::size_t i = 0; i < data.size(); ++i)
    {
        // Scale a uniform sample in [0, 1] into [min_val, max_val].
        data[i] = static_cast<T>(min_val + (max_val - min_val) *
                                 static_cast<float>(rand()) / RAND_MAX);
    }
}
// Helper to print matrix (for debugging)
template<typename T>
void print_matrix(const std::vector<T>& mat, index_t rows, index_t cols,
index_t ld, bool col_major = true, const std::string& name = "Matrix")
{
std::cout << name << " (" << rows << "×" << cols << "):\n";
for(index_t i = 0; i < std::min(rows, index_t(8)); ++i) {
for(index_t j = 0; j < std::min(cols, index_t(8)); ++j) {
index_t idx = col_major ? (i + j * ld) : (i * ld + j);
std::cout << std::setw(8) << std::setprecision(3) << mat[idx] << " ";
}
if(cols > 8) std::cout << "...";
std::cout << "\n";
}
if(rows > 8) std::cout << "...\n";
std::cout << "\n";
}
// Host driver: sets up a 64x64x64 mixed-precision GEMM (fp16 in, fp32
// accumulate), runs a CPU reference, launches the distributed kernel
// (one wave per 16x16 output block), verifies, and reports timing.
int main()
{
    std::cout << "\n==================================================\n";
    std::cout << "Stage 10b: Distributed HGEMM with ck_tile\n";
    std::cout << "==================================================\n\n";
    std::cout << "Key features:\n";
    std::cout << "• Uses tile_distribution for both A and B matrices\n";
    std::cout << "• A is column-major, B is row-major (like MLSE example)\n";
    std::cout << "• Half-precision inputs (fp16) with fp32 accumulation\n";
    std::cout << "• Non-contiguous loads for A and B\n";
    std::cout << "• Uses move_tile_window to advance along K dimension\n\n";

    // Test configuration - must be multiples of 16 (MFMA tile size).
    constexpr index_t M = 64;
    constexpr index_t N = 64;
    constexpr index_t K = 64;
    // Leading dimensions
    constexpr index_t lda = M; // Column-major
    constexpr index_t ldb = N; // Row-major
    constexpr index_t ldc = M; // Column-major
    constexpr index_t ldd = M; // Column-major

    using InputType = half_t; // fp16
    using AccumType = float;  // fp32

    constexpr AccumType alpha = 2.0f;
    constexpr AccumType beta  = 1.5f;

    std::cout << "Problem configuration:\n";
    std::cout << "  M×N×K: " << M << "×" << N << "×" << K << "\n";
    std::cout << "  A: column-major, lda=" << lda << " (fp16)\n";
    std::cout << "  B: row-major, ldb=" << ldb << " (fp16)\n";
    std::cout << "  C/D: column-major, ldc=" << ldc << ", ldd=" << ldd << " (fp32)\n";
    std::cout << "  alpha=" << alpha << ", beta=" << beta << "\n";
    std::cout << "  Total FLOPs: " << 2*M*N*K << "\n\n";

    // Host memory. D is pre-filled with NaN so untouched outputs are
    // caught by the verification pass.
    std::vector<InputType> h_a(M * K);
    std::vector<InputType> h_b(K * N);
    std::vector<AccumType> h_c(M * N);
    std::vector<AccumType> h_d(M * N, std::numeric_limits<AccumType>::quiet_NaN());
    std::vector<AccumType> h_d_ref(M * N);

    // Initialize matrices with a fixed seed for reproducibility.
    srand(42);
    fill_random(h_a, InputType(-1), InputType(1));
    fill_random(h_b, InputType(-1), InputType(1));
    fill_random(h_c, AccumType(-1), AccumType(1));

    // CPU reference (also timed, for the speedup report below).
    auto cpu_start = std::chrono::high_resolution_clock::now();
    reference_gemm_mixed(h_a, h_b, h_c, h_d_ref, M, N, K, lda, ldb, ldc, ldd, alpha, beta);
    auto cpu_end = std::chrono::high_resolution_clock::now();
    double cpu_time_ms = std::chrono::duration<double, std::milli>(cpu_end - cpu_start).count();

    // Device memory
    DeviceMem d_a(M * K * sizeof(InputType));
    DeviceMem d_b(K * N * sizeof(InputType));
    DeviceMem d_c(M * N * sizeof(AccumType));
    DeviceMem d_d(M * N * sizeof(AccumType));
    d_a.ToDevice(h_a.data(), M * K * sizeof(InputType));
    d_b.ToDevice(h_b.data(), K * N * sizeof(InputType));
    d_c.ToDevice(h_c.data(), M * N * sizeof(AccumType));
    d_d.ToDevice(h_d.data(), M * N * sizeof(AccumType));

    // Launch configuration: one 64-thread wave per 16×16 output block.
    constexpr index_t block_size = 64; // One wave
    const index_t grid_size = (M / 16) * (N / 16); // One wave per 16×16 output block
    std::cout << "Launching kernel:\n";
    std::cout << "  Grid: " << grid_size << " blocks\n";
    std::cout << "  Block: " << block_size << " threads (1 wave)\n";
    std::cout << "  Output blocks: " << (M/16) << "×" << (N/16) << " = " << grid_size << "\n";
    std::cout << "  MFMA instructions per block: " << K/16 << "\n\n";

    stream_config stream;
    // Warmup launches so the timed run is not skewed by first-launch cost.
    for(int i = 0; i < 5; ++i) {
        launch_kernel(stream,
            make_kernel<block_size>(
                DistributedHgemmKernel<InputType, InputType, AccumType, AccumType>{},
                dim3(grid_size),
                dim3(block_size),
                0,
                static_cast<const InputType*>(d_a.GetDeviceBuffer()),
                static_cast<const InputType*>(d_b.GetDeviceBuffer()),
                static_cast<const AccumType*>(d_c.GetDeviceBuffer()),
                static_cast<AccumType*>(d_d.GetDeviceBuffer()),
                M, N, K, lda, ldb, ldc, ldd, alpha, beta));
    }
    hip_check_error(hipDeviceSynchronize());

    // Timed run (wall-clock around a single synchronized launch).
    auto gpu_start = std::chrono::high_resolution_clock::now();
    launch_kernel(stream,
        make_kernel<block_size>(
            DistributedHgemmKernel<InputType, InputType, AccumType, AccumType>{},
            dim3(grid_size),
            dim3(block_size),
            0,
            static_cast<const InputType*>(d_a.GetDeviceBuffer()),
            static_cast<const InputType*>(d_b.GetDeviceBuffer()),
            static_cast<const AccumType*>(d_c.GetDeviceBuffer()),
            static_cast<AccumType*>(d_d.GetDeviceBuffer()),
            M, N, K, lda, ldb, ldc, ldd, alpha, beta));
    hip_check_error(hipDeviceSynchronize());
    auto gpu_end = std::chrono::high_resolution_clock::now();
    double gpu_time_ms = std::chrono::duration<double, std::milli>(gpu_end - gpu_start).count();

    // Get result
    d_d.FromDevice(h_d.data(), M * N * sizeof(AccumType));

    // Verify against the CPU reference with an fp16-appropriate tolerance;
    // print at most the first 5 mismatches.
    bool passed = true;
    float max_error = 0;
    index_t error_count = 0;
    for(index_t i = 0; i < M * N; ++i) {
        float error = std::abs(h_d[i] - h_d_ref[i]);
        max_error = std::max(max_error, error);
        if(error > 1e-2f) { // Relaxed tolerance for fp16
            if(error_count < 5) {
                index_t m = i % M;
                index_t n = i / M;
                std::cout << "Error at [" << m << "," << n << "]: "
                          << h_d[i] << " vs " << h_d_ref[i]
                          << " (diff=" << error << ")\n";
            }
            error_count++;
        }
    }
    passed = (error_count == 0);

    // Calculate performance
    double gflops = 2.0 * M * N * K / 1e9;
    double gpu_tflops = gflops / (gpu_time_ms / 1000);
    double cpu_gflops = gflops / (cpu_time_ms / 1000);
    std::cout << "Results:\n";
    std::cout << "  Correctness: " << (passed ? "✓ PASSED" : "✗ FAILED") << "\n";
    std::cout << "  Max error: " << max_error << "\n";
    if(!passed) std::cout << "  Error count: " << error_count << "/" << M*N << "\n";
    std::cout << "\n";
    std::cout << "Performance:\n";
    std::cout << "  CPU time: " << cpu_time_ms << " ms (" << cpu_gflops << " GFLOPS)\n";
    std::cout << "  GPU time: " << gpu_time_ms << " ms (" << gpu_tflops << " TFLOPS)\n";
    std::cout << "  Speedup: " << cpu_time_ms / gpu_time_ms << "x\n\n";

#ifdef DEBUG_OUTPUT
    // Print sample outputs for debugging
    print_matrix(h_a, M, K, lda, true, "A (col-major)");
    print_matrix(h_b, K, N, ldb, false, "B (row-major)");
    print_matrix(h_c, M, N, ldc, true, "C (col-major)");
    print_matrix(h_d_ref, M, N, ldd, true, "D_ref (col-major)");
    print_matrix(h_d, M, N, ldd, true, "D_gpu (col-major)");
#endif

    std::cout << "=== Key Insights ===\n";
    std::cout << "• Y dimensions should map to vector position in hierarchical factorization\n";
    std::cout << "• move_tile_window efficiently advances along K dimension\n";
    std::cout << "• Column-major A and row-major B require different distributions\n";
    std::cout << "• Each thread loads 4 elements (2x fp16 = 32 bits per load)\n";
    std::cout << "• MFMA efficiently computes 16×16×16 in one instruction\n";
    std::cout << "• This pattern extends to production GEMM kernels\n\n";
    return passed ? 0 : 1;
}

View File

@@ -0,0 +1,27 @@
# Tutorial 06: Tile Sweeping GEMM
# Demonstrates tile sweeping with multiple warps
# Multiple warps cooperate to compute larger output blocks

# Create executable for tile sweeping GEMM tutorial
add_executable(aa_tutorial_06_tile_sweeping_gemm tile_sweeping_gemm.cpp)

# Make the ck_tile headers (two directories up) visible to this tutorial only.
target_include_directories(aa_tutorial_06_tile_sweeping_gemm PRIVATE
    ${CMAKE_CURRENT_SOURCE_DIR}/../..
)

# Debug build flags, previously left as commented-out code. Guarded behind a
# cache option so they can be enabled with -DCK_TILE_TUTORIAL_DEBUG_FLAGS=ON
# instead of editing this file.
option(CK_TILE_TUTORIAL_DEBUG_FLAGS
       "Build ck_tile tutorials with -Wall -O0 -g --save-temps" OFF)
if(CK_TILE_TUTORIAL_DEBUG_FLAGS)
    target_compile_options(aa_tutorial_06_tile_sweeping_gemm PRIVATE
        -Wall
        -O0
        -g
        --save-temps
    )
endif()

# Message for build output
message(STATUS "Added Tutorial 06: Tile Sweeping GEMM - Multiple warps with tile sweeping pattern")

# Add test subdirectory
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/tests/CMakeLists.txt)
    add_subdirectory(tests)
endif()

View File

@@ -0,0 +1,31 @@
# Test programs for Tutorial 06 tile distributions
#
# Each test is a standalone executable exercising one tile-distribution
# encoding: A/B with explicit NWarp/MWarp replication, and the same pair
# built with the embed API. The four stanzas were identical except for the
# target/source names, so they are generated from one data-driven loop.
set(tutorial_06_dist_tests
    test_a_dist_replication:test_a_distribution_with_replication.cpp
    test_b_dist_replication:test_b_distribution_with_replication.cpp
    test_a_embed:test_a_embed_distribution.cpp
    test_b_embed:test_b_embed_distribution.cpp
)

foreach(entry IN LISTS tutorial_06_dist_tests)
    # Split "target:source" into its two parts.
    string(REPLACE ":" ";" pair "${entry}")
    list(GET pair 0 test_target)
    list(GET pair 1 test_source)

    add_executable(${test_target} ${test_source})
    # ck_tile headers live three directories up from this tests folder.
    target_include_directories(${test_target} PRIVATE
        ${CMAKE_CURRENT_SOURCE_DIR}/../../..
    )
endforeach()

message(STATUS "Added Tutorial 06 distribution tests (with and without embed API)")

View File

@@ -0,0 +1,231 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
/*
* Test A matrix distribution with NWarp replication
*
* Goal: Load a 16x16 A matrix with 256 threads (4 warps in 2x2 config)
* - A is replicated across NWarp (2 N-warps)
* - Each M-warp (2 total) loads different M-rows
* - Each thread loads 4 fp16 elements
*/
#include <iostream>
#include <vector>
#include <iomanip>
#include "ck_tile/core.hpp"
#include "ck_tile/host.hpp"
using namespace ck_tile;
// Single-block test kernel: loads a 16x16 column-major A matrix with 256
// threads (4 waves, 2x2 MWarp x NWarp) using a tile distribution that
// replicates across NWarp. Prints per-lane coverage (decoded from the
// A[m,k] = m + k*100 encoding chosen by the host) and writes each thread's
// 4 loaded values to debug_output for host-side verification.
template<typename DataType>
struct TestADistributionKernel
{
    static constexpr index_t kBlockSize = 256; // 4 warps
    static constexpr index_t MWarp = 2;
    static constexpr index_t NWarp = 2;
    static constexpr index_t kM = 16;
    static constexpr index_t kK = 16;

    // a: 16x16 column-major input; debug_output: 256 threads x 4 values;
    // lda: leading dimension of a.
    CK_TILE_DEVICE void operator()(const DataType* a,
                                   DataType* debug_output,
                                   index_t lda) const
    {
        // Only one block participates in this test.
        if(get_block_id() != 0)
            return;
        const index_t tid = threadIdx.x;
        const index_t warp_id = tid / 64;
        const index_t lane_id = tid % 64;

        // Create tensor view for A (column-major)
        const auto a_tensor = make_naive_tensor_view<address_space_enum::global>(
            a,
            make_tuple(kM, kK),
            make_tuple(1, lda),
            number<1>{},
            number<1>{}
        );

        // A distribution WITH NWarp replication and MWarp in H-dimension
        // Based on 02_gemm pattern: include MWarp in H-tuple
        // R:  NWarp replication
        // H0: MWarp × 16 threads = 2×16 = 32 M positions
        // H1: 4×4 = 16 K elements
        constexpr auto a_distribution = make_static_tile_distribution(
            tile_distribution_encoding<
                sequence<NWarp>,                        // R: REPLICATE across 2 N-warps
                tuple<sequence<MWarp, 16>,              // H0 (M): 2 M-warps × 16 threads = 32 M
                      sequence<4, 4>>,                  // H1 (K): 4×4 = 16 K elements
                tuple<sequence<0, 1>, sequence<2, 1>>,  // Ps_to_Hs: P0→(R,M), P1→(M,K)
                tuple<sequence<0, 0>, sequence<0, 1>>,  // Ps_in_Hs: positions
                sequence<2>,                            // Ys_to_Hs: Y maps to K (dimension 2)
                sequence<1>>{}                          // Ys_in_Hs: Y at position 1 in K
        );

        auto a_window = make_tile_window(
            a_tensor,
            make_tuple(number<kM>{}, number<kK>{}),
            {0, 0},
            a_distribution
        );
        const auto a_tile = load_tile(a_window);
        const auto& thread_buffer = a_tile.get_thread_buffer();

        // Calculate matrix coordinates using make_tensor_coordinate
        // This shows which matrix positions each thread accesses
        __syncthreads();
        if(tid == 0) {
            printf("\n=== Matrix Coverage (Tiled by Warp) ===\n");
            printf("Distribution covers 32×16 matrix (MWarp×16 threads × K)\n");
            printf("With NWarp=2 replication, pattern repeats\n");
            printf("Showing first 16 threads of each warp:\n\n");
        }
        __syncthreads();

        // Print warp by warp with calculated coordinates; the heavy
        // __syncthreads() use serializes printf output for readability.
        for(int w = 0; w < 4; ++w) {
            __syncthreads();
            if(warp_id == w && lane_id == 0) {
                printf("\n--- Warp %d (M-warp %d, N-warp %d) ---\n",
                       w, w/NWarp, w%NWarp);
            }
            __syncthreads();
            // Print lanes sequentially within each warp
            for(int lane = 0; lane < 16; ++lane) {
                __syncthreads();
                if(warp_id == w && lane_id == lane) {
                    printf("W%d L%02d: ", w, lane);
                    // For each Y element, just print the loaded value
                    // The distribution handles the coordinate mapping internally
                    for(int y_idx = 0; y_idx < thread_buffer.size(); ++y_idx) {
                        float val = static_cast<float>(thread_buffer[y_idx]);
                        // Decode (m,k) from the host's A[m,k] = m + k*100 encoding.
                        int m = static_cast<int>(val) % 100;
                        int k = static_cast<int>(val) / 100;
                        printf("A[%2d,%2d] ", m, k);
                    }
                    printf("\n");
                }
            }
        }
        __syncthreads();

        if(tid == 0) {
            printf("\n=== Expected Pattern with NWarp Replication ===\n");
            printf("sequence<NWarp> replicates across N-warp dimension:\n");
            printf("Warp 0 (M-warp 0, N-warp 0): Loads some M-rows, K[0-15]\n");
            printf("Warp 1 (M-warp 0, N-warp 1): Loads DIFFERENT M-rows (different N-warp)\n");
            printf("Warp 2 (M-warp 1, N-warp 0): Loads SAME as Warp 0 (NWarp replication!)\n");
            printf("Warp 3 (M-warp 1, N-warp 1): Loads SAME as Warp 1 (NWarp replication!)\n");
            printf("\nReplication pairs:\n");
            printf("  Warps 0 & 2 should be identical (same N-warp 0, replicated across M-warps)\n");
            printf("  Warps 1 & 3 should be identical (same N-warp 1, replicated across M-warps)\n");
            printf("  Warps 0 & 1 should be DIFFERENT (different N-warps)\n");
        }

        // Store for verification: 4 values per thread, indexed by tid.
        for(int i = 0; i < thread_buffer.size(); ++i) {
            debug_output[tid * 4 + i] = thread_buffer[i];
        }
    }
};
// Host driver: builds a 16x16 A matrix encoded as A[m,k] = m + k*100,
// runs TestADistributionKernel on one 256-thread block, then verifies the
// NWarp replication property on the returned debug buffer (warp 0 == warp
// 2 and warp 1 == warp 3, elementwise).
int main()
{
    std::cout << "\n==================================================\n";
    std::cout << "Test A Distribution with NWarp Replication\n";
    std::cout << "==================================================\n\n";

    constexpr index_t M = 16;
    constexpr index_t K = 16;
    constexpr index_t lda = M;
    using DataType = half_t;

    // Create test matrix; debug buffer holds 4 values per thread,
    // pre-filled with -1 so unwritten slots are visible.
    std::vector<DataType> h_a(M * K);
    std::vector<DataType> h_debug(256 * 4, -1);
    // Initialize A[m,k] = m + k*100 so (m,k) can be decoded from a value.
    for(index_t k = 0; k < K; ++k) {
        for(index_t m = 0; m < M; ++m) {
            h_a[m + k * lda] = static_cast<DataType>(m + k * 100);
        }
    }

    DeviceMem d_a(M * K * sizeof(DataType));
    DeviceMem d_debug(256 * 4 * sizeof(DataType));
    d_a.ToDevice(h_a.data(), M * K * sizeof(DataType));
    d_debug.ToDevice(h_debug.data(), 256 * 4 * sizeof(DataType));

    stream_config stream;
    // Single block of 256 threads (4 waves).
    launch_kernel(stream,
        make_kernel<256>(
            TestADistributionKernel<DataType>{},
            dim3(1),
            dim3(256),
            0,
            static_cast<const DataType*>(d_a.GetDeviceBuffer()),
            static_cast<DataType*>(d_debug.GetDeviceBuffer()),
            lda));
    hip_check_error(hipDeviceSynchronize());
    d_debug.FromDevice(h_debug.data(), 256 * 4 * sizeof(DataType));

    // Verify NWarp replication: warps 0&2 identical, warps 1&3 identical
    bool passed = true;
    // Check warps 0 and 2 (same N-warp 0, replicated across M-warps)
    for(int lane = 0; lane < 64; ++lane) {
        for(int i = 0; i < 4; ++i) {
            float warp0_val = h_debug[lane * 4 + i];
            float warp2_val = h_debug[(128 + lane) * 4 + i];
            if(std::abs(warp0_val - warp2_val) > 0.01f) {
                std::cout << "ERROR: Warp 0 and Warp 2 differ at lane " << lane << " element " << i << "\n";
                std::cout << "  Warp 0: " << warp0_val << ", Warp 2: " << warp2_val << "\n";
                passed = false;
                break;
            }
        }
        if(!passed) break;
    }
    // Check warps 1 and 3 (same N-warp 1, replicated across M-warps)
    if(passed) {
        for(int lane = 0; lane < 64; ++lane) {
            for(int i = 0; i < 4; ++i) {
                float warp1_val = h_debug[(64 + lane) * 4 + i];
                float warp3_val = h_debug[(192 + lane) * 4 + i];
                if(std::abs(warp1_val - warp3_val) > 0.01f) {
                    std::cout << "ERROR: Warp 1 and Warp 3 differ at lane " << lane << " element " << i << "\n";
                    std::cout << "  Warp 1: " << warp1_val << ", Warp 3: " << warp3_val << "\n";
                    passed = false;
                    break;
                }
            }
            if(!passed) break;
        }
    }

    if(passed) {
        std::cout << "\n✓ NWarp Replication verified:\n";
        std::cout << "  Warps 0 & 2 load identical data (N-warp 0, replicated across M-warps)\n";
        std::cout << "  Warps 1 & 3 load identical data (N-warp 1, replicated across M-warps)\n";
    }
    return passed ? 0 : 1;
}

View File

@@ -0,0 +1,259 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
/*
* Test A matrix distribution using EMBED API
*
* Goal: Load a 16x16 A matrix with 256 threads (4 warps in 2x2 config)
* Uses detail::make_embed_tile_distribution_encoding to separate:
* - Block-level: Warp organization with replication
* - Warp-level: Thread organization within each warp
*/
#include <iostream>
#include <vector>
#include <iomanip>
#include "ck_tile/core.hpp"
#include "ck_tile/host.hpp"
using namespace ck_tile;
// Embed-API variant of the A-distribution test: builds the same logical
// distribution as the explicit-encoding version, but composes it from a
// warp-level encoding and a block-level (warp-organization) encoding via
// detail::make_embed_tile_distribution_encoding, mirroring 02_gemm.
// Prints per-lane coverage and writes each thread's 4 loaded values to
// debug_output for host-side replication checks.
template<typename DataType>
struct TestADistributionKernel
{
    static constexpr index_t kBlockSize = 256; // 4 warps
    static constexpr index_t MWarp = 2;
    static constexpr index_t NWarp = 2;
    static constexpr index_t kM = 16;
    static constexpr index_t kK = 16;

    // a: 16x16 column-major input; debug_output: 256 threads x 4 values;
    // lda: leading dimension of a.
    CK_TILE_DEVICE void operator()(const DataType* a,
                                   DataType* debug_output,
                                   index_t lda) const
    {
        // Only one block participates in this test.
        if(get_block_id() != 0)
            return;
        const index_t tid = threadIdx.x;
        const index_t warp_id = tid / 64;
        const index_t lane_id = tid % 64;

        // Create tensor view for A (column-major)
        const auto a_tensor = make_naive_tensor_view<address_space_enum::global>(
            a,
            make_tuple(kM, kK),
            make_tuple(1, lda),
            number<1>{},
            number<1>{}
        );

        // A distribution using EMBED API (like 02_gemm)
        // Separate block-level and warp-level distributions.
        // The monolithic encoding this replaces is kept for reference:
        // constexpr auto a_distribution = make_static_tile_distribution(
        //     tile_distribution_encoding<
        //         sequence<NWarp>,                        // R: REPLICATE across 2 N-warps
        //         tuple<sequence<MWarp, 16>,              // H0 (M): 2 M-warps × 16 threads = 32 M
        //               sequence<4, 4>>,                  // H1 (K): 4×4 = 16 K elements
        //         tuple<sequence<0, 1>, sequence<2, 1>>,  // Ps_to_Hs: P0→(R,M), P1→(M,K)
        //         tuple<sequence<0, 0>, sequence<0, 1>>,  // Ps_in_Hs: positions
        //         sequence<2>,                            // Ys_to_Hs: Y maps to K (dimension 2)
        //         sequence<1>>{}                          // Ys_in_Hs: Y at position 1 in K
        // );

        // Step 1: Warp-level distribution (64 threads within one warp)
        constexpr auto a_warp_dstr_encode = tile_distribution_encoding<
            sequence<>,             // No replication at warp level
            tuple<sequence<16>,     // H0 (M): 16 M positions
                  sequence<4, 4>>,  // H1 (K): 4×4 = 16 K elements
            tuple<sequence<2, 1>>,  // Ps_to_Hs: 2D P-space (64 threads)
            tuple<sequence<0, 0>>,  // Ps_in_Hs
            sequence<2>,            // Ys_to_Hs: Y maps to K
            sequence<1>>{};         // Ys_in_Hs

        // Step 2: Block-level outer distribution (warp organization)
        // Must have same NDimX as inner (2 dimensions: M and K)
        constexpr auto a_block_outer_dstr_encode = tile_distribution_encoding<
            sequence<NWarp>,                     // R: Replicate across N-warps
            tuple<sequence<MWarp>, sequence<>>,  // H: MWarp in M-dim, 1 in K-dim
            tuple<sequence<0, 1>>,               // Ps_to_Hs
            tuple<sequence<0, 0>>,               // Ps_in_Hs
            sequence<>,                          // Ys_to_Hs: Y maps to both M and K
            sequence<>>{};                       // Ys_in_Hs

        // Step 3: Embed warp-level into block-level
        constexpr auto a_block_dstr_encode =
            detail::make_embed_tile_distribution_encoding(
                a_block_outer_dstr_encode, a_warp_dstr_encode);

        // Step 4: Create final distribution
        constexpr auto a_distribution = make_static_tile_distribution(a_block_dstr_encode);

        auto a_window = make_tile_window(
            a_tensor,
            make_tuple(number<kM>{}, number<kK>{}),
            {0, 0},
            a_distribution
        );
        const auto a_tile = load_tile(a_window);
        const auto& thread_buffer = a_tile.get_thread_buffer();

        // Calculate matrix coordinates using make_tensor_coordinate
        // This shows which matrix positions each thread accesses
        __syncthreads();
        if(tid == 0) {
            printf("\n=== Matrix Coverage (Tiled by Warp) ===\n");
            printf("Distribution covers 32×16 matrix (MWarp×16 threads × K)\n");
            printf("With NWarp=2 replication, pattern repeats\n");
            printf("Showing first 16 threads of each warp:\n\n");
        }
        __syncthreads();

        // Print warp by warp with calculated coordinates; the heavy
        // __syncthreads() use serializes printf output for readability.
        for(int w = 0; w < 4; ++w) {
            __syncthreads();
            if(warp_id == w && lane_id == 0) {
                printf("\n--- Warp %d (M-warp %d, N-warp %d) ---\n",
                       w, w/NWarp, w%NWarp);
            }
            __syncthreads();
            // Print lanes sequentially within each warp
            for(int lane = 0; lane < 16; ++lane) {
                __syncthreads();
                if(warp_id == w && lane_id == lane) {
                    printf("W%d L%02d: ", w, lane);
                    // For each Y element, just print the loaded value
                    // The distribution handles the coordinate mapping internally
                    for(int y_idx = 0; y_idx < thread_buffer.size(); ++y_idx) {
                        float val = static_cast<float>(thread_buffer[y_idx]);
                        // Decode (m,k) from the host's A[m,k] = m + k*100 encoding.
                        int m = static_cast<int>(val) % 100;
                        int k = static_cast<int>(val) / 100;
                        printf("A[%2d,%2d] ", m, k);
                    }
                    printf("\n");
                }
            }
        }
        __syncthreads();

        if(tid == 0) {
            printf("\n=== Expected Pattern with NWarp Replication ===\n");
            printf("sequence<NWarp> replicates across N-warp dimension:\n");
            printf("Warp 0 (M-warp 0, N-warp 0): Loads some M-rows, K[0-15]\n");
            printf("Warp 1 (M-warp 0, N-warp 1): Loads DIFFERENT M-rows (different N-warp)\n");
            printf("Warp 2 (M-warp 1, N-warp 0): Loads SAME as Warp 0 (NWarp replication!)\n");
            printf("Warp 3 (M-warp 1, N-warp 1): Loads SAME as Warp 1 (NWarp replication!)\n");
            printf("\nReplication pairs:\n");
            printf("  Warps 0 & 2 should be identical (same N-warp 0, replicated across M-warps)\n");
            printf("  Warps 1 & 3 should be identical (same N-warp 1, replicated across M-warps)\n");
            printf("  Warps 0 & 1 should be DIFFERENT (different N-warps)\n");
        }

        // Store for verification: 4 values per thread, indexed by tid.
        for(int i = 0; i < thread_buffer.size(); ++i) {
            debug_output[tid * 4 + i] = thread_buffer[i];
        }
    }
};
// Host driver for the embed-API variant: identical setup and verification
// to the explicit-encoding test — builds A[m,k] = m + k*100, runs the
// kernel on one 256-thread block, and checks that warp 0 == warp 2 and
// warp 1 == warp 3 in the returned debug buffer.
int main()
{
    std::cout << "\n==================================================\n";
    std::cout << "Test A Distribution using EMBED API\n";
    std::cout << "==================================================\n\n";
    std::cout << "Separates block-level (warp organization) from warp-level (thread organization)\n\n";

    constexpr index_t M = 16;
    constexpr index_t K = 16;
    constexpr index_t lda = M;
    using DataType = half_t;

    // Create test matrix; debug buffer holds 4 values per thread,
    // pre-filled with -1 so unwritten slots are visible.
    std::vector<DataType> h_a(M * K);
    std::vector<DataType> h_debug(256 * 4, -1);
    // Initialize A[m,k] = m + k*100 so (m,k) can be decoded from a value.
    for(index_t k = 0; k < K; ++k) {
        for(index_t m = 0; m < M; ++m) {
            h_a[m + k * lda] = static_cast<DataType>(m + k * 100);
        }
    }

    DeviceMem d_a(M * K * sizeof(DataType));
    DeviceMem d_debug(256 * 4 * sizeof(DataType));
    d_a.ToDevice(h_a.data(), M * K * sizeof(DataType));
    d_debug.ToDevice(h_debug.data(), 256 * 4 * sizeof(DataType));

    stream_config stream;
    // Single block of 256 threads (4 waves).
    launch_kernel(stream,
        make_kernel<256>(
            TestADistributionKernel<DataType>{},
            dim3(1),
            dim3(256),
            0,
            static_cast<const DataType*>(d_a.GetDeviceBuffer()),
            static_cast<DataType*>(d_debug.GetDeviceBuffer()),
            lda));
    hip_check_error(hipDeviceSynchronize());
    d_debug.FromDevice(h_debug.data(), 256 * 4 * sizeof(DataType));

    // Verify NWarp replication: warps 0&2 identical, warps 1&3 identical
    bool passed = true;
    // Check warps 0 and 2 (same N-warp 0, replicated across M-warps)
    for(int lane = 0; lane < 64; ++lane) {
        for(int i = 0; i < 4; ++i) {
            float warp0_val = h_debug[lane * 4 + i];
            float warp2_val = h_debug[(128 + lane) * 4 + i];
            if(std::abs(warp0_val - warp2_val) > 0.01f) {
                std::cout << "ERROR: Warp 0 and Warp 2 differ at lane " << lane << " element " << i << "\n";
                std::cout << "  Warp 0: " << warp0_val << ", Warp 2: " << warp2_val << "\n";
                passed = false;
                break;
            }
        }
        if(!passed) break;
    }
    // Check warps 1 and 3 (same N-warp 1, replicated across M-warps)
    if(passed) {
        for(int lane = 0; lane < 64; ++lane) {
            for(int i = 0; i < 4; ++i) {
                float warp1_val = h_debug[(64 + lane) * 4 + i];
                float warp3_val = h_debug[(192 + lane) * 4 + i];
                if(std::abs(warp1_val - warp3_val) > 0.01f) {
                    std::cout << "ERROR: Warp 1 and Warp 3 differ at lane " << lane << " element " << i << "\n";
                    std::cout << "  Warp 1: " << warp1_val << ", Warp 3: " << warp3_val << "\n";
                    passed = false;
                    break;
                }
            }
            if(!passed) break;
        }
    }

    if(passed) {
        std::cout << "\n✓ NWarp Replication verified:\n";
        std::cout << "  Warps 0 & 2 load identical data (N-warp 0, replicated across M-warps)\n";
        std::cout << "  Warps 1 & 3 load identical data (N-warp 1, replicated across M-warps)\n";
    }
    return passed ? 0 : 1;
}

View File

@@ -0,0 +1,255 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
/*
* Test B matrix distribution with MWarp replication
*
* Goal: Load a 16x16 B matrix with 256 threads (4 warps in 2x2 config)
* - B is replicated across MWarp (2 M-warps)
* - Each N-warp (2 total) loads different N-columns
* - Each thread loads 4 fp16 elements
*/
#include <iostream>
#include <vector>
#include <iomanip>
#include "ck_tile/core.hpp"
#include "ck_tile/host.hpp"
using namespace ck_tile;
// Kernel: load a 16x16 B tile with 4 warps (2x2) using a single direct
// tile_distribution_encoding that carries an MWarp replication dimension.
//
// The R (replication) dimension has extent MWarp = 2, and the key detail is
// that Ps_to_Hs includes dimension 0 (the R dimension) so part of the thread
// id selects a replica. With this encoding the replicated pairs observed and
// verified by the host are warps (0,1) and (2,3).
//
// Each thread's loaded values are written to debug_output[tid*4 ..] so the
// host can check the replication pattern.
template<typename DataType>
struct TestBDistributionKernel
{
    static constexpr index_t kBlockSize = 256; // 4 warps of 64 lanes
    static constexpr index_t MWarp = 2;        // warps along M (replication axis for B)
    static constexpr index_t NWarp = 2;        // warps along N
    static constexpr index_t kK = 16;          // K extent of the loaded tile
    static constexpr index_t kN = 16;          // N extent of the loaded tile

    // b            : B matrix in global memory, row-major, leading dimension ldb
    // debug_output : 256 threads x 4 values; each thread stores its loads here
    CK_TILE_DEVICE void operator()(const DataType* b,
                                   DataType* debug_output,
                                   index_t ldb) const
    {
        // Single-block test: only block 0 participates.
        if(get_block_id() != 0)
            return;
        const index_t tid     = threadIdx.x;
        const index_t warp_id = tid / 64;
        const index_t lane_id = tid % 64;

        // Tensor view over global B (row-major): shape (kK, kN), strides (ldb, 1).
        const auto b_tensor = make_naive_tensor_view<address_space_enum::global>(
            b,
            make_tuple(kK, kN),
            make_tuple(ldb, 1),
            number<4>{},
            number<1>{}
        );

        // B distribution WITH MWarp replication.
        // R dimension (index 0) holds MWarp=2 replicas; H dimensions are
        // H1 = K and H2 = N. Total threads: 256 = MWarp(2) × NWarp(2) × 64.
        constexpr auto b_distribution = make_static_tile_distribution(
            tile_distribution_encoding<
                sequence<MWarp>,                       // R: dimension 0, REPLICATE across 2 M-warps
                tuple<sequence<4, 4>,                  // H: dimension 1 (K): 4×4 = 16 K elements
                      sequence<2, 16>>,                // H: dimension 2 (N): 16 N positions
                tuple<sequence<2, 0>, sequence<1, 2>>, // Ps_to_Hs: P0→(N,R), P1→(K,N)
                tuple<sequence<0, 0>, sequence<0, 1>>, // Ps_in_Hs: positions inside each H
                sequence<1>,                           // Ys_to_Hs: Y maps to K (dimension 1)
                sequence<1>>{}                         // Ys_in_Hs: Y at position 1 in K
        );

        auto b_window = make_tile_window(
            b_tensor,
            make_tuple(number<kK>{}, number<kN>{}),
            {0, 0},
            b_distribution
        );

        const auto b_tile         = load_tile(b_window);
        const auto& thread_buffer = b_tile.get_thread_buffer();

        // Sequential printing with __syncthreads() so the output stays ordered.
        __syncthreads();
        if(tid == 0) {
            printf("\n=== Matrix Coverage (Tiled by Warp) ===\n");
            printf("Distribution covers K×32 matrix (K × NWarp×16 threads)\n");
            printf("With MWarp=2 replication, pattern repeats\n");
            printf("Showing all 64 threads of each warp:\n\n");
        }
        __syncthreads();

        // Print warp by warp; values decode back to coordinates via k + n*100.
        for(int w = 0; w < 4; ++w) {
            __syncthreads();
            if(warp_id == w && lane_id == 0) {
                printf("\n--- Warp %d (M-warp %d, N-warp %d) ---\n",
                       w, w/NWarp, w%NWarp);
            }
            __syncthreads();
            // Print lanes sequentially within each warp
            for(int lane = 0; lane < 64; ++lane) {
                __syncthreads();
                if(warp_id == w && lane_id == lane) {
                    printf("W%d L%02d: ", w, lane);
                    // Decode k (units) and n (hundreds) from each loaded value.
                    for(int y_idx = 0; y_idx < thread_buffer.size(); ++y_idx) {
                        float val = static_cast<float>(thread_buffer[y_idx]);
                        int k = static_cast<int>(val) % 100;
                        int n = static_cast<int>(val) / 100;
                        printf("B[%2d,%2d] ", k, n);
                    }
                    printf("\n");
                }
            }
        }
        __syncthreads();
        if(tid == 0) {
            printf("\n=== Observed Pattern ===\n");
            printf("Warps 0 & 1 are identical\n");
            printf("Warps 2 & 3 are identical\n");
            printf("Warps 0 & 2 are different\n");
        }

        // Store each thread's loads for host-side verification (4 per thread).
        for(int i = 0; i < thread_buffer.size(); ++i) {
            debug_output[tid * 4 + i] = thread_buffer[i];
        }
    }
};
int main()
{
std::cout << "\n==================================================\n";
std::cout << "Test B Distribution with MWarp Replication\n";
std::cout << "==================================================\n\n";
constexpr index_t K = 16;
constexpr index_t N = 32;
constexpr index_t ldb = N;
using DataType = half_t;
// Create test matrix
std::vector<DataType> h_b(K * N);
std::vector<DataType> h_debug(256 * 4, -1);
// Initialize B[k,n] = k + n*100 (row-major)
for(index_t k = 0; k < K; ++k) {
for(index_t n = 0; n < N; ++n) {
h_b[k * ldb + n] = static_cast<DataType>(k + n * 100);
}
}
DeviceMem d_b(K * N * sizeof(DataType));
DeviceMem d_debug(256 * 4 * sizeof(DataType));
d_b.ToDevice(h_b.data(), K * N * sizeof(DataType));
d_debug.ToDevice(h_debug.data(), 256 * 4 * sizeof(DataType));
stream_config stream;
launch_kernel(stream,
make_kernel<256>(
TestBDistributionKernel<DataType>{},
dim3(1),
dim3(256),
0,
static_cast<const DataType*>(d_b.GetDeviceBuffer()),
static_cast<DataType*>(d_debug.GetDeviceBuffer()),
ldb));
hip_check_error(hipDeviceSynchronize());
d_debug.FromDevice(h_debug.data(), 256 * 4 * sizeof(DataType));
// Verify: Based on your observation, warps 0&1 identical, warps 2&3 identical
bool passed = true;
// Check warps 0 and 1
for(int lane = 0; lane < 64; ++lane) {
for(int i = 0; i < 4; ++i) {
float warp0_val = h_debug[lane * 4 + i];
float warp1_val = h_debug[(64 + lane) * 4 + i];
if(std::abs(warp0_val - warp1_val) > 0.01f) {
std::cout << "ERROR: Warp 0 and Warp 1 differ at lane " << lane << " element " << i << "\n";
std::cout << " Warp 0: " << warp0_val << ", Warp 1: " << warp1_val << "\n";
passed = false;
break;
}
}
if(!passed) break;
}
// Check warps 2 and 3
if(passed) {
for(int lane = 0; lane < 64; ++lane) {
for(int i = 0; i < 4; ++i) {
float warp2_val = h_debug[(128 + lane) * 4 + i];
float warp3_val = h_debug[(192 + lane) * 4 + i];
if(std::abs(warp2_val - warp3_val) > 0.01f) {
std::cout << "ERROR: Warp 2 and Warp 3 differ at lane " << lane << " element " << i << "\n";
std::cout << " Warp 2: " << warp2_val << ", Warp 3: " << warp3_val << "\n";
passed = false;
break;
}
}
if(!passed) break;
}
}
if(passed) {
std::cout << "\n✓ Replication verified:\n";
std::cout << " Warps 0 & 1 load identical data\n";
std::cout << " Warps 2 & 3 load identical data\n";
}
return passed ? 0 : 1;
}

View File

@@ -0,0 +1,248 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
/*
* Test B matrix distribution using EMBED API
*
* Goal: Load a 16x16 B matrix with 256 threads (4 warps in 2x2 config)
* Uses detail::make_embed_tile_distribution_encoding to separate:
* - Block-level: Warp organization with replication
* - Warp-level: Thread organization within each warp
*/
#include <iostream>
#include <vector>
#include <iomanip>
#include "ck_tile/core.hpp"
#include "ck_tile/host.hpp"
using namespace ck_tile;
// Kernel: load a 16x16 B tile with 4 warps (2x2) using the EMBED API.
//
// The distribution is built in two layers and then composed:
//   1. a warp-level encoding (how 64 lanes cover a 16x16 K×N tile)
//   2. a block-level outer encoding (how the 4 warps are organized,
//      including replication of B across the MWarp axis)
// detail::make_embed_tile_distribution_encoding() merges the two.
// Each thread's 4 loaded values are written to debug_output so the host can
// verify the replication pattern (warps 0&1 identical, warps 2&3 identical).
template<typename DataType>
struct TestBDistributionKernel
{
    static constexpr index_t kBlockSize = 256; // 4 warps of 64 lanes
    static constexpr index_t MWarp = 2;        // warps along M (replication axis for B)
    static constexpr index_t NWarp = 2;        // warps along N
    static constexpr index_t kK = 16;          // K extent of the loaded tile
    static constexpr index_t kN = 16;          // N extent of the loaded tile

    // b            : B matrix in global memory, row-major, leading dimension ldb
    // debug_output : 256 threads x 4 values; each thread stores its loads here
    CK_TILE_DEVICE void operator()(const DataType* b,
                                   DataType* debug_output,
                                   index_t ldb) const
    {
        // Single-block test: only block 0 participates.
        if(get_block_id() != 0)
            return;
        const index_t tid = threadIdx.x;
        const index_t warp_id = tid / 64;
        const index_t lane_id = tid % 64;
        // Tensor view over global B (row-major): shape (kK, kN), strides (ldb, 1).
        const auto b_tensor = make_naive_tensor_view<address_space_enum::global>(
            b,
            make_tuple(kK, kN),
            make_tuple(ldb, 1),
            number<4>{},
            number<1>{}
        );
        // Step 1: Warp-level distribution (64 threads within one warp).
        constexpr auto b_warp_dstr_encode = tile_distribution_encoding<
            sequence<>,            // No replication at warp level
            tuple<sequence<4, 4>,  // H0 (K): 4×4 = 16 K elements
                  sequence<16>>,   // H1 (N): 16 N positions
            tuple<sequence<1, 2>>, // Ps_to_Hs: 1 sequence with 2 values (2D P-space)
            tuple<sequence<0, 0>>, // Ps_in_Hs
            sequence<1>,           // Ys_to_Hs: Y maps to K
            sequence<1>>{};        // Ys_in_Hs
        // Step 2: Block-level outer distribution (warp organization).
        // Must have the same NDimX as the inner encoding (2 dims: K and N).
        constexpr auto b_block_outer_dstr_encode = tile_distribution_encoding<
            sequence<MWarp>,                    // R: Replicate across M-warps
            tuple<sequence<>, sequence<NWarp>>, // H0 (K): empty, H1 (N): NWarp
            tuple<sequence<2, 0>>,              // Ps_to_Hs
            tuple<sequence<0, 0>>,              // Ps_in_Hs
            sequence<>,                         // Ys_to_Hs: no outer Y dims
            sequence<>>{};                      // Ys_in_Hs
        // Equivalent single "direct" encoding, kept for comparison with the
        // two-layer embed construction above:
        // constexpr auto b_distribution = make_static_tile_distribution(
        //     tile_distribution_encoding<
        //         sequence<MWarp>, // R: dimension 0, REPLICATE across 2 M-warps
        //         tuple<sequence<4, 4>, // H: dimension 1 (K): 4×4 = 16 K elements
        //         sequence<2, 16>>, // H: dimension 2 (N): 16 N positions
        //         tuple<sequence<2, 0>, sequence<1, 2>>, // Ps_to_Hs: P0→R(dim 0), P1→K(dim 1), P2→N(dim 2)
        //         tuple<sequence<0, 0>, sequence<0, 1>>, // Ps_in_Hs: positions
        //         sequence<1>, // Ys_to_Hs: Y maps to K (dimension 1)
        //         sequence<1>>{} // Ys_in_Hs: Y at position 1 in K
        //     );
        // Step 3: Embed the warp-level encoding into the block-level one.
        constexpr auto b_block_dstr_encode =
            detail::make_embed_tile_distribution_encoding(
                b_block_outer_dstr_encode, b_warp_dstr_encode);
        // Step 4: Create the final distribution.
        constexpr auto b_distribution = make_static_tile_distribution(b_block_dstr_encode);
        auto b_window = make_tile_window(
            b_tensor,
            make_tuple(number<kK>{}, number<kN>{}),
            {0, 0},
            b_distribution
        );
        const auto b_tile = load_tile(b_window);
        const auto& thread_buffer = b_tile.get_thread_buffer();
        // Sequential printing with __syncthreads() so the output stays ordered.
        __syncthreads();
        if(tid == 0) {
            printf("\n=== Matrix Coverage (Tiled by Warp) ===\n");
            printf("Distribution covers K×32 matrix (K × NWarp×16 threads)\n");
            printf("With MWarp=2 replication, pattern repeats\n");
            printf("Showing first 16 threads of each warp:\n\n");
        }
        __syncthreads();
        // Print warp by warp; values decode back to coordinates via k + n*100.
        for(int w = 0; w < 4; ++w) {
            __syncthreads();
            if(warp_id == w && lane_id == 0) {
                printf("\n--- Warp %d (M-warp %d, N-warp %d) ---\n",
                       w, w/NWarp, w%NWarp);
            }
            __syncthreads();
            // Only the first 16 lanes of each warp are printed (matches header).
            for(int lane = 0; lane < 16; ++lane) {
                __syncthreads();
                if(warp_id == w && lane_id == lane) {
                    printf("W%d L%02d: ", w, lane);
                    // Decode k (units) and n (hundreds) from each loaded value.
                    for(int y_idx = 0; y_idx < thread_buffer.size(); ++y_idx) {
                        float val = static_cast<float>(thread_buffer[y_idx]);
                        int k = static_cast<int>(val) % 100;
                        int n = static_cast<int>(val) / 100;
                        printf("B[%2d,%2d] ", k, n);
                    }
                    printf("\n");
                }
            }
        }
        __syncthreads();
        if(tid == 0) {
            printf("\n=== Observed Pattern ===\n");
            printf("Warps 0 & 1 are identical\n");
            printf("Warps 2 & 3 are identical\n");
            printf("Warps 0 & 2 are different\n");
        }
        // Store each thread's loads for host-side verification (4 per thread).
        for(int i = 0; i < thread_buffer.size(); ++i) {
            debug_output[tid * 4 + i] = thread_buffer[i];
        }
    }
};
int main()
{
std::cout << "\n==================================================\n";
std::cout << "Test B Distribution using EMBED API\n";
std::cout << "==================================================\n\n";
std::cout << "Separates block-level (warp organization) from warp-level (thread organization)\n\n";
constexpr index_t K = 32;
constexpr index_t N = 32;
constexpr index_t ldb = N;
using DataType = half_t;
// Create test matrix
std::vector<DataType> h_b(K * N);
std::vector<DataType> h_debug(256 * 4, -1);
// Initialize B[k,n] = k + n*100 (row-major)
for(index_t k = 0; k < K; ++k) {
for(index_t n = 0; n < N; ++n) {
h_b[k * ldb + n] = static_cast<DataType>(k + n * 100);
}
}
DeviceMem d_b(K * N * sizeof(DataType));
DeviceMem d_debug(256 * 4 * sizeof(DataType));
d_b.ToDevice(h_b.data(), K * N * sizeof(DataType));
d_debug.ToDevice(h_debug.data(), 256 * 4 * sizeof(DataType));
stream_config stream;
launch_kernel(stream,
make_kernel<256>(
TestBDistributionKernel<DataType>{},
dim3(1),
dim3(256),
0,
static_cast<const DataType*>(d_b.GetDeviceBuffer()),
static_cast<DataType*>(d_debug.GetDeviceBuffer()),
ldb));
hip_check_error(hipDeviceSynchronize());
d_debug.FromDevice(h_debug.data(), 256 * 4 * sizeof(DataType));
// Verify: Based on your observation, warps 0&1 identical, warps 2&3 identical
bool passed = true;
// Check warps 0 and 1
for(int lane = 0; lane < 64; ++lane) {
for(int i = 0; i < 4; ++i) {
float warp0_val = h_debug[lane * 4 + i];
float warp1_val = h_debug[(64 + lane) * 4 + i];
if(std::abs(warp0_val - warp1_val) > 0.01f) {
std::cout << "ERROR: Warp 0 and Warp 1 differ at lane " << lane << " element " << i << "\n";
std::cout << " Warp 0: " << warp0_val << ", Warp 1: " << warp1_val << "\n";
passed = false;
break;
}
}
if(!passed) break;
}
// Check warps 2 and 3
if(passed) {
for(int lane = 0; lane < 64; ++lane) {
for(int i = 0; i < 4; ++i) {
float warp2_val = h_debug[(128 + lane) * 4 + i];
float warp3_val = h_debug[(192 + lane) * 4 + i];
if(std::abs(warp2_val - warp3_val) > 0.01f) {
std::cout << "ERROR: Warp 2 and Warp 3 differ at lane " << lane << " element " << i << "\n";
std::cout << " Warp 2: " << warp2_val << ", Warp 3: " << warp3_val << "\n";
passed = false;
break;
}
}
if(!passed) break;
}
}
if(passed) {
std::cout << "\n✓ Replication verified:\n";
std::cout << " Warps 0 & 1 load identical data\n";
std::cout << " Warps 2 & 3 load identical data\n";
}
return passed ? 0 : 1;
}

View File

@@ -0,0 +1,557 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
/*
* Tutorial 06: Tile Sweeping GEMM
*
 * Demonstrates a multi-warp GEMM in which a 2×2 warp grid cooperates to
 * compute one 32×32 output block per thread block, sweeping the A and B
 * tile windows across the K dimension with move_tile_window.
 *
 * Key concepts:
 * - Multiple warps per block (2×2 warp configuration)
 * - Each warp computes one 16×16 output tile of the block
 * - Tile distributions with replication (A and B matrices)
 * - Using move_tile_window to advance over K chunks
*/
#include <iostream>
#include <vector>
#include <iomanip>
#include <chrono>
#include <limits>
#include "ck_tile/core.hpp"
#include "ck_tile/host.hpp"
#include "ck_tile/ops/common.hpp"
#include "ck_tile/ops/gemm/warp/warp_gemm.hpp"
using namespace ck_tile;
// Tile Sweeping HGEMM kernel with multiple warps.
//
// One thread block = 2×2 warps (256 threads) computing a 32×32 output block:
// each warp owns exactly ONE 16×16 output tile, and the block sweeps its
// A/B tile windows across the K dimension, issuing one MFMA per K chunk.
//
// Computes D = alpha * A * B + beta * C, where A (M×K) is column-major,
// B (K×N) is row-major, and C/D (M×N) are column-major.
template<typename ADataType, typename BDataType, typename CDataType, typename AccDataType>
struct TileSweepingHgemmKernel
{
    static constexpr index_t kWaveSize = 64; // AMD wave size
    static constexpr index_t kWarpM = 16;    // MFMA M dimension per warp
    static constexpr index_t kWarpN = 16;    // MFMA N dimension per warp
    static constexpr index_t kWarpK = 16;    // MFMA K dimension per instruction
    // Warp configuration: 2×2 warps per block
    static constexpr index_t MWarp = 2; // 2 warps in M dimension
    static constexpr index_t NWarp = 2; // 2 warps in N dimension
    static constexpr index_t kBlockSize = MWarp * NWarp * kWaveSize; // 256 threads
    // No per-warp tile iterations - each warp computes exactly one 16×16 output tile
    // Use ck_tile's WarpGemm wrapper for the MFMA instruction
    using WarpGemm = WarpGemmMfmaF16F16F32M16N16K16;

    // a, b, c, d      : device pointers to the four matrices
    // M, N, K         : GEMM problem sizes (the grid/encoding assumes M and N
    //                   are multiples of 32 and K a multiple of 16 — see below)
    // lda/ldb/ldc/ldd : leading dimensions (layouts noted per parameter)
    // alpha, beta     : D = alpha * A*B + beta * C
    CK_TILE_DEVICE void operator()(const ADataType* a,
                                   const BDataType* b,
                                   const CDataType* c,
                                   CDataType* d,
                                   index_t M,
                                   index_t N,
                                   index_t K,
                                   index_t lda, // Leading dimension of A (column-major)
                                   index_t ldb, // Leading dimension of B (row-major)
                                   index_t ldc, // Leading dimension of C (column-major)
                                   index_t ldd, // Leading dimension of D (column-major)
                                   AccDataType alpha,
                                   AccDataType beta) const
    {
        // Which warp of the 2×2 grid this thread belongs to within the block.
        const index_t warp_id = get_warp_id();
        const index_t iMWarp = warp_id / NWarp; // M-warp index (0 or 1)
        const index_t iNWarp = warp_id % NWarp; // N-warp index (0 or 1)
        const index_t block_id = get_block_id();
        // Convert linear block_id to 2D grid coordinates.
        // Assumes N is a multiple of NWarp*kWarpN (= 32).
        const index_t num_blocks_n = N / (NWarp * kWarpN); // Number of blocks in N dimension
        const index_t block_m = block_id / num_blocks_n; // M-block index
        const index_t block_n = block_id % num_blocks_n; // N-block index
        // printf("Block %d (grid [%d,%d]), Warp %d (M-warp %d, N-warp %d)\n",
        //        block_id, block_m, block_n, warp_id, iMWarp, iNWarp);
        // Base coordinates of this warp's single 16×16 output tile.
        const index_t m_warp_base = block_m * (MWarp * kWarpM) + iMWarp * kWarpM;
        const index_t n_warp_base = block_n * (NWarp * kWarpN) + iNWarp * kWarpN;
        // Bounds check for the warp's entire region
        if(m_warp_base >= M || n_warp_base >= N)
            return;
        // Tensor views over global memory.
        // A is column-major: M×K with stride lda between columns
        const auto a_tensor = make_naive_tensor_view<address_space_enum::global>(
            a,
            make_tuple(M, K), // Shape: M×K
            make_tuple(1, lda), // Strides: column-major
            number<1>{},
            number<1>{}
        );
        // B is row-major: K×N with stride ldb between rows
        const auto b_tensor = make_naive_tensor_view<address_space_enum::global>(
            b,
            make_tuple(K, N), // Shape: K×N
            make_tuple(ldb, 1), // Strides: row-major
            number<4>{},
            number<1>{}
        );
        // C is column-major: M×N with stride ldc between columns
        const auto c_tensor = make_naive_tensor_view<address_space_enum::global>(
            c,
            make_tuple(M, N), // Shape: M×N
            make_tuple(1, ldc), // Strides: column-major
            number<1>{},
            number<1>{}
        );
        // D is column-major: M×N with stride ldd between columns
        auto d_tensor = make_naive_tensor_view<address_space_enum::global>(
            d,
            make_tuple(M, N), // Shape: M×N
            make_tuple(1, ldd), // Strides: column-major
            number<1>{},
            number<1>{}
        );
        // ============================================================================
        // TILE DISTRIBUTIONS using EMBED API (same encodings as the standalone
        // A/B distribution tests in this tutorial series)
        // ============================================================================
        // Step 1: A warp-level distribution (64 threads within one warp)
        constexpr auto a_warp_dstr_encode = tile_distribution_encoding<
            sequence<>,            // No replication at warp level
            tuple<sequence<16>,    // H0 (M): 16 M positions
                  sequence<4, 4>>, // H1 (K): 4×4 = 16 K elements
            tuple<sequence<2, 1>>, // Ps_to_Hs: 2D P-space (64 threads)
            tuple<sequence<0, 0>>, // Ps_in_Hs
            sequence<2>,           // Ys_to_Hs: Y maps to K
            sequence<1>>{};        // Ys_in_Hs
        // Step 2: A block-level outer distribution (warp organization).
        // Must have same NDimX as inner (2 dimensions: M and K).
        constexpr auto a_block_outer_dstr_encode = tile_distribution_encoding<
            sequence<NWarp>,                    // R: Replicate A across N-warps
            tuple<sequence<MWarp>, sequence<>>, // H0 (M): MWarp, H1 (K): empty
            tuple<sequence<0, 1>>,              // Ps_to_Hs
            tuple<sequence<0, 0>>,              // Ps_in_Hs
            sequence<>,                         // Ys_to_Hs: no outer Y dims
            sequence<>>{};                      // Ys_in_Hs
        // B warp-level distribution
        constexpr auto b_warp_dstr_encode = tile_distribution_encoding<
            sequence<>,            // No replication at warp level
            tuple<sequence<4, 4>,  // H0 (K): 4×4 = 16 K elements
                  sequence<16>>,   // H1 (N): 16 N positions
            tuple<sequence<1, 2>>, // Ps_to_Hs: 1 sequence with 2 values (2D P-space)
            tuple<sequence<0, 0>>, // Ps_in_Hs
            sequence<1>,           // Ys_to_Hs: Y maps to K
            sequence<1>>{};        // Ys_in_Hs
        // B block-level outer distribution (warp organization).
        // Must have same NDimX as inner (2 dimensions: K and N).
        constexpr auto b_block_outer_dstr_encode = tile_distribution_encoding<
            sequence<MWarp>,                    // R: Replicate B across M-warps
            tuple<sequence<>, sequence<NWarp>>, // H0 (K): empty, H1 (N): NWarp
            tuple<sequence<2, 0>>,              // Ps_to_Hs
            tuple<sequence<0, 0>>,              // Ps_in_Hs
            sequence<>,                         // Ys_to_Hs: no outer Y dims
            sequence<>>{};                      // Ys_in_Hs
        // Embed warp-level into block-level encodings (adds the replication).
        constexpr auto a_block_dstr_encode =
            detail::make_embed_tile_distribution_encoding(
                a_block_outer_dstr_encode, a_warp_dstr_encode);
        constexpr auto b_block_dstr_encode =
            detail::make_embed_tile_distribution_encoding(
                b_block_outer_dstr_encode, b_warp_dstr_encode);
        // Equivalent single "direct" encodings, kept for comparison with the
        // two-layer embed construction above:
        // constexpr auto a_block_dstr_encode =
        //     tile_distribution_encoding<
        //         sequence<NWarp>, // R: REPLICATE across 2 N-warps
        //         tuple<sequence<MWarp, 16>, // H0 (M): 2 M-warps × 16 threads = 32 M
        //         sequence<4, 4>>, // H1 (K): 4×4 = 16 K elements
        //         tuple<sequence<0, 1>, sequence<2, 1>>, // Ps_to_Hs: P0→(R,M), P1→(M,K)
        //         tuple<sequence<0, 0>, sequence<0, 1>>, // Ps_in_Hs: positions
        //         sequence<2>, // Ys_to_Hs: Y maps to K (dimension 2)
        //         sequence<1>>{}; // Ys_in_Hs: Y at position 1 in K
        // // Direct approach (like test_b_distribution_with_replication.cpp)
        // constexpr auto b_block_dstr_encode =
        //     tile_distribution_encoding<
        //         sequence<MWarp>, // R: dimension 0, REPLICATE across 2 M-warps
        //         tuple<sequence<4, 4>, // H: dimension 1 (K): 4×4 = 16 K elements
        //         sequence<2, 16>>, // H: dimension 2 (N): 16 N positions
        //         tuple<sequence<2, 0>, sequence<1, 2>>, // Ps_to_Hs: P0→R(dim 0), P1→K(dim 1), P2→N(dim 2)
        //         tuple<sequence<0, 0>, sequence<0, 1>>, // Ps_in_Hs: positions
        //         sequence<1>, // Ys_to_Hs: Y maps to K (dimension 1)
        //         sequence<1>>{}; // Ys_in_Hs: Y at position 1 in K
        // Use block-level distributions for loading (includes replication)
        constexpr auto a_block_distribution = make_static_tile_distribution(a_block_dstr_encode);
        constexpr auto b_block_distribution = make_static_tile_distribution(b_block_dstr_encode);
        // C Distribution: block-level distribution for the 32×32 output.
        // No replication - each warp computes its own unique output region.
        // (Earlier single-encoding attempt, kept for reference:)
        // constexpr auto c_block_dstr_encode = tile_distribution_encoding<
        //     sequence<>, // No replication for output
        //     tuple<sequence<MWarp, 4, 4>, // H0 (M): 2 M-warps × 16 threads = 32
        //     sequence<NWarp, 16>>, // H1 (N): 2 N-warps × 16 threads = 32
        //     tuple<sequence<2, 1>, sequence<1, 2>>, // Ps_to_Hs: P[0]→M, P[1]→N (2D P-space)
        //     tuple<sequence<0, 0>, sequence<1, 1>>, // Ps_in_Hs
        //     sequence<1>, // No Y dimension for output
        //     sequence<2>>{};
        constexpr auto c_warp_dstr_encode = tile_distribution_encoding<
            sequence<>,                         // No replication at warp level
            tuple<sequence<4, 4>, sequence<16>>,
            tuple<sequence<1, 2>>,
            tuple<sequence<0, 0>>,
            sequence<1>,
            sequence<1>>{};
        constexpr auto c_block_outer_dstr_encode = tile_distribution_encoding<
            sequence<>,            // No replication for output
            tuple<sequence<MWarp>, // H0 (M): MWarp
                  sequence<NWarp>>, // H1 (N): NWarp
            tuple<sequence<2, 1>>, // Ps_to_Hs
            tuple<sequence<0, 0>>, // Ps_in_Hs
            sequence<>,            // Ys_to_Hs: no outer Y dims
            sequence<>>{};         // Ys_in_Hs
        constexpr auto c_block_dstr_encode =
            detail::make_embed_tile_distribution_encoding(
                c_block_outer_dstr_encode, c_warp_dstr_encode);
        constexpr auto c_block_distribution = make_static_tile_distribution(c_block_dstr_encode);
        // Block-level input windows, one K-chunk at a time:
        // A: 32×16 (all M-rows of the block × one K-chunk)
        // B: 16×32 (one K-chunk × all N-columns of the block)
        auto a_block_window = make_tile_window(
            a_tensor,
            make_tuple(number<MWarp * kWarpM>{}, number<kWarpK>{}), // 32×16
            {block_m * (MWarp * kWarpM), 0},
            a_block_distribution
        );
        auto b_block_window = make_tile_window(
            b_tensor,
            make_tuple(number<kWarpK>{}, number<NWarp * kWarpN>{}), // 16×32
            {0, block_n * (NWarp * kWarpN)},
            b_block_distribution
        );
        // Block-level accumulator tile (covers all 4 warps), zero-initialized.
        auto c_block_tile = make_static_distributed_tensor<AccDataType>(c_block_distribution);
        set_tile(c_block_tile, AccDataType{0});
        // Main K-loop: one MFMA per 16-wide K chunk.
        // Assumes K is a multiple of kWarpK.
        const index_t num_k_loops = K / kWarpK;
        for(index_t k_iter = 0; k_iter < num_k_loops; ++k_iter)
        {
            // Load block tiles - the distributions handle replication, so each
            // warp automatically gets its correct portion.
            const auto a_tile = load_tile(a_block_window);
            const auto b_tile = load_tile(b_block_window);
            // MFMA: C += A * B; each warp updates its portion of the block tile.
            WarpGemm{}(c_block_tile, a_tile, b_tile);
            // Move windows to next K chunk (skip after the last iteration).
            if(k_iter < num_k_loops - 1) {
                move_tile_window(a_block_window, {0, kWarpK});
                move_tile_window(b_block_window, {kWarpK, 0});
            }
        }
        // Scale by alpha
        tile_elementwise_inout([alpha](auto& acc_val) { acc_val *= alpha; }, c_block_tile);
        // Add beta * C only when beta is numerically non-zero (skips the C load).
        // NOTE(review): std::abs here runs in device code - relies on HIP's
        // device overloads being available; confirm against the toolchain.
        if(std::abs(beta) > 1e-6f)
        {
            auto c_block_window = make_tile_window(
                c_tensor,
                make_tuple(number<MWarp * kWarpM>{}, number<NWarp * kWarpN>{}), // 32×32
                {block_m * (MWarp * kWarpM), block_n * (NWarp * kWarpN)},
                c_block_distribution
            );
            const auto c_input_block_tile = load_tile(c_block_window);
            tile_elementwise_inout(
                [beta](const auto& c_val, auto& acc_val) {
                    acc_val += beta * c_val;
                },
                c_input_block_tile, c_block_tile);
        }
        // Store the final block result to D (entire 32×32 region).
        auto d_block_window = make_tile_window(
            d_tensor,
            make_tuple(number<MWarp * kWarpM>{}, number<NWarp * kWarpN>{}), // 32×32
            {block_m * (MWarp * kWarpM), block_n * (NWarp * kWarpN)},
            c_block_distribution
        );
        store_tile(d_block_window, c_block_tile);
    }
};
// CPU reference for verification
template<typename InType, typename AccType>
void reference_gemm_mixed(const std::vector<InType>& a, // Column-major
const std::vector<InType>& b, // Row-major
const std::vector<AccType>& c, // Column-major
std::vector<AccType>& d, // Column-major
index_t M, index_t N, index_t K,
index_t lda, index_t ldb, index_t ldc, index_t ldd,
AccType alpha, AccType beta)
{
// D = alpha * A * B + beta * C
for(index_t n = 0; n < N; ++n) {
for(index_t m = 0; m < M; ++m) {
AccType sum = 0;
// Compute A * B
for(index_t k = 0; k < K; ++k) {
// A is column-major: A[m,k] = a[m + k*lda]
// B is row-major: B[k,n] = b[k*ldb + n]
sum += static_cast<AccType>(a[m + k * lda]) *
static_cast<AccType>(b[k * ldb + n]);
}
// D[m,n] = alpha * sum + beta * C[m,n]
// Both C and D are column-major
d[m + n * ldd] = alpha * sum + beta * c[m + n * ldc];
}
}
}
// Fill `data` with pseudo-random values drawn uniformly from [min_val, max_val].
// Uses rand(), so results are reproducible after srand() with a fixed seed.
template<typename T>
void fill_random(std::vector<T>& data, T min_val = -1, T max_val = 1)
{
    for(std::size_t i = 0; i < data.size(); ++i) {
        data[i] = static_cast<T>(min_val + (max_val - min_val) *
                                 static_cast<float>(rand()) / RAND_MAX);
    }
}
// Helper to print matrix (for debugging)
template<typename T>
void print_matrix(const std::vector<T>& mat, index_t rows, index_t cols,
index_t ld, bool col_major = true, const std::string& name = "Matrix")
{
std::cout << name << " (" << rows << "×" << cols << "):\n";
for(index_t i = 0; i < std::min(rows, index_t(8)); ++i) {
for(index_t j = 0; j < std::min(cols, index_t(8)); ++j) {
index_t idx = col_major ? (i + j * ld) : (i * ld + j);
std::cout << std::setw(8) << std::setprecision(3) << mat[idx] << " ";
}
if(cols > 8) std::cout << "...";
std::cout << "\n";
}
if(rows > 8) std::cout << "...\n";
std::cout << "\n";
}
int main()
{
std::cout << "\n==================================================\n";
std::cout << "Tutorial 06: Tile Sweeping GEMM\n";
std::cout << "==================================================\n\n";
std::cout << "Key features:\n";
std::cout << "• Multiple warps per block (2×2 warp configuration)\n";
std::cout << "• Each warp sweeps over 4×4 output tiles\n";
std::cout << "• Tile distribution with replication (B matrix)\n";
std::cout << "• Uses static_for loops for tile iteration\n";
std::cout << "• Uses move_tile_window to position windows\n\n";
// Test configuration - simple 4-warp example
constexpr index_t M = 64;
constexpr index_t N = 64;
constexpr index_t K = 32;
// Leading dimensions
constexpr index_t lda = M; // Column-major
constexpr index_t ldb = N; // Row-major
constexpr index_t ldc = M; // Column-major
constexpr index_t ldd = M; // Column-major
using InputType = half_t; // fp16
using AccumType = float; // fp32
constexpr AccumType alpha = 2.0f;
constexpr AccumType beta = 1.5f;
std::cout << "Problem configuration:\n";
std::cout << " M×N×K: " << M << "×" << N << "×" << K << "\n";
std::cout << " A: column-major, lda=" << lda << " (fp16)\n";
std::cout << " B: row-major, ldb=" << ldb << " (fp16)\n";
std::cout << " C/D: column-major, ldc=" << ldc << ", ldd=" << ldd << " (fp32)\n";
std::cout << " alpha=" << alpha << ", beta=" << beta << "\n";
std::cout << " Total FLOPs: " << 2*M*N*K << "\n\n";
// Host memory
std::vector<InputType> h_a(M * K);
std::vector<InputType> h_b(K * N);
std::vector<AccumType> h_c(M * N);
std::vector<AccumType> h_d(M * N, std::numeric_limits<AccumType>::quiet_NaN());
std::vector<AccumType> h_d_ref(M * N);
// Initialize matrices
srand(42);
fill_random(h_a, InputType(-1), InputType(1));
fill_random(h_b, InputType(-1), InputType(1));
fill_random(h_c, AccumType(-1), AccumType(1));
// CPU reference
auto cpu_start = std::chrono::high_resolution_clock::now();
reference_gemm_mixed(h_a, h_b, h_c, h_d_ref, M, N, K, lda, ldb, ldc, ldd, alpha, beta);
auto cpu_end = std::chrono::high_resolution_clock::now();
double cpu_time_ms = std::chrono::duration<double, std::milli>(cpu_end - cpu_start).count();
// Device memory
DeviceMem d_a(M * K * sizeof(InputType));
DeviceMem d_b(K * N * sizeof(InputType));
DeviceMem d_c(M * N * sizeof(AccumType));
DeviceMem d_d(M * N * sizeof(AccumType));
d_a.ToDevice(h_a.data(), M * K * sizeof(InputType));
d_b.ToDevice(h_b.data(), K * N * sizeof(InputType));
d_c.ToDevice(h_c.data(), M * N * sizeof(AccumType));
d_d.ToDevice(h_d.data(), M * N * sizeof(AccumType));
// Launch kernel
constexpr index_t block_size = 256; // 4 warps (2×2 configuration)
constexpr index_t tiles_per_block_m = 2; // MWarp (no iterations)
constexpr index_t tiles_per_block_n = 2; // NWarp (no iterations)
const index_t grid_size = (M / (tiles_per_block_m * 16)) * (N / (tiles_per_block_n * 16));
std::cout << "Launching kernel:\n";
std::cout << " Grid: " << grid_size << " blocks\n";
std::cout << " Block: " << block_size << " threads (4 warps in 2×2 config)\n";
std::cout << " Each warp computes: ONE 16×16 output tile\n";
std::cout << " Each block computes: " << tiles_per_block_m*16 << "×" << tiles_per_block_n*16 << " output\n";
std::cout << " Total output tiles: " << (M/16) << "×" << (N/16) << "\n";
std::cout << " MFMA instructions per warp: " << (K/16) << "\n\n";
stream_config stream;
// Warmup
for(int i = 0; i < 5; ++i) {
launch_kernel(stream,
make_kernel<block_size>(
TileSweepingHgemmKernel<InputType, InputType, AccumType, AccumType>{},
dim3(grid_size),
dim3(block_size),
0,
static_cast<const InputType*>(d_a.GetDeviceBuffer()),
static_cast<const InputType*>(d_b.GetDeviceBuffer()),
static_cast<const AccumType*>(d_c.GetDeviceBuffer()),
static_cast<AccumType*>(d_d.GetDeviceBuffer()),
M, N, K, lda, ldb, ldc, ldd, alpha, beta));
}
hip_check_error(hipDeviceSynchronize());
// Timed run
auto gpu_start = std::chrono::high_resolution_clock::now();
launch_kernel(stream,
make_kernel<block_size>(
TileSweepingHgemmKernel<InputType, InputType, AccumType, AccumType>{},
dim3(grid_size),
dim3(block_size),
0,
static_cast<const InputType*>(d_a.GetDeviceBuffer()),
static_cast<const InputType*>(d_b.GetDeviceBuffer()),
static_cast<const AccumType*>(d_c.GetDeviceBuffer()),
static_cast<AccumType*>(d_d.GetDeviceBuffer()),
M, N, K, lda, ldb, ldc, ldd, alpha, beta));
hip_check_error(hipDeviceSynchronize());
auto gpu_end = std::chrono::high_resolution_clock::now();
double gpu_time_ms = std::chrono::duration<double, std::milli>(gpu_end - gpu_start).count();
// Get result
d_d.FromDevice(h_d.data(), M * N * sizeof(AccumType));
// Verify
bool passed = true;
float max_error = 0;
index_t error_count = 0;
for(index_t i = 0; i < M * N; ++i) {
float error = std::abs(h_d[i] - h_d_ref[i]);
max_error = std::max(max_error, error);
if(error > 1e-2f) { // Relaxed tolerance for fp16
if(error_count < 5) {
index_t m = i % M;
index_t n = i / M;
std::cout << "Error at [" << m << "," << n << "]: "
<< h_d[i] << " vs " << h_d_ref[i]
<< " (diff=" << error << ")\n";
}
error_count++;
}
}
passed = (error_count == 0);
// Calculate performance
double gflops = 2.0 * M * N * K / 1e9;
double gpu_tflops = gflops / (gpu_time_ms / 1000);
double cpu_gflops = gflops / (cpu_time_ms / 1000);
std::cout << "Results:\n";
std::cout << " Correctness: " << (passed ? "✓ PASSED" : "✗ FAILED") << "\n";
std::cout << " Max error: " << max_error << "\n";
if(!passed) std::cout << " Error count: " << error_count << "/" << M*N << "\n";
std::cout << "\n";
std::cout << "Performance:\n";
std::cout << " CPU time: " << cpu_time_ms << " ms (" << cpu_gflops << " GFLOPS)\n";
std::cout << " GPU time: " << gpu_time_ms << " ms (" << gpu_tflops << " TFLOPS)\n";
std::cout << " Speedup: " << cpu_time_ms / gpu_time_ms << "x\n\n";
#ifdef DEBUG_OUTPUT
// Print sample outputs for debugging
print_matrix(h_a, M, K, lda, true, "A (col-major)");
print_matrix(h_b, K, N, ldb, false, "B (row-major)");
print_matrix(h_c, M, N, ldc, true, "C (col-major)");
print_matrix(h_d_ref, M, N, ldd, true, "D_ref (col-major)");
print_matrix(h_d, M, N, ldd, true, "D_gpu (col-major)");
#endif
std::cout << "=== Key Insights ===\n";
std::cout << "• Tile sweeping allows warps to compute multiple output tiles\n";
std::cout << "• static_for loops iterate over tiles at compile time\n";
std::cout << "• move_tile_window positions windows at different tiles\n";
std::cout << "• A matrix REPLICATES across N-warps (warps in same M-row need same A)\n";
std::cout << "• B matrix REPLICATES across M-warps (warps in same N-column need same B)\n";
std::cout << "• Replication in R parameter: A has sequence<NWarp>, B has sequence<MWarp>\n";
std::cout << "• This pattern scales to production GEMM kernels\n";
std::cout << "• 2×2 warp config with 4×4 tiles per warp = 128×128 output per block\n\n";
return passed ? 0 : 1;
}

View File

@@ -0,0 +1,27 @@
# Tutorial 07: Tile Sweeping with Y-Dimension Repetition
# Demonstrates true tile sweeping using Y-dimension repetition in distributions
# Follows the pattern from 02_gemm for production-ready code

# Create executable for tile sweeping with Y-repetition tutorial
add_executable(aa_tutorial_07_tile_sweeping_y_repetition tile_sweeping_with_y_repetition.cpp)

# The tutorial sources include headers relative to the ck_tile example root.
target_include_directories(aa_tutorial_07_tile_sweeping_y_repetition PRIVATE
    ${CMAKE_CURRENT_SOURCE_DIR}/../..
)

# Message for build output
message(STATUS "Added Tutorial 07: Tile Sweeping with Y-Dimension Repetition - Multiple warps with Y-repetition for true tile sweeping")

# Add test subdirectory (optional; only present in full checkouts)
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/tests/CMakeLists.txt)
    add_subdirectory(tests)
endif()

View File

@@ -0,0 +1,27 @@
# Tests for Tutorial 07: Tile Sweeping with Y-Dimension Repetition
#
# Each test is a standalone executable exercising one aspect of the
# Y-repetition distributions used by the tutorial kernel. All of them share
# the same include root, so register them in one loop.
set(tutorial07_test_targets
    test_a_distribution_y_repetition  # A distribution with Y-repetition
    test_b_distribution_y_repetition  # B distribution with Y-repetition
    test_b_y_slicing                  # B Y-slicing with get_y_sliced_thread_data
    test_a_y_slicing                  # A Y-slicing with get_y_sliced_thread_data
)

foreach(test_target IN LISTS tutorial07_test_targets)
    add_executable(${test_target} ${test_target}.cpp)
    target_include_directories(${test_target} PRIVATE
        ${CMAKE_CURRENT_SOURCE_DIR}/../../..
    )
endforeach()

message(STATUS "Added Tutorial 07 tests: A and B distributions with Y-repetition, A and B Y-slicing")

View File

@@ -0,0 +1,173 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
/*
* Test A matrix distribution with Y-dimension repetition
*
* Goal: Load a 64x16 A matrix with 256 threads (4 warps in 2x2 config)
* With MIterPerWarp=2, each warp should load 2 tiles of 16x16 in M dimension
*/
#include <iostream>
#include <vector>
#include <iomanip>
#include "ck_tile/core.hpp"
#include "ck_tile/host.hpp"
using namespace ck_tile;
// Diagnostic kernel: loads a block of column-major A through a distribution
// with Y-dimension repetition and has each warp's lane 0 print its thread
// buffer, so replication across N-warps and M-tile sweeping can be inspected.
template<typename DataType>
struct TestADistributionYRepetitionKernel
{
    static constexpr index_t kBlockSize = 256;  // 4 waves of 64 lanes
    static constexpr index_t MWarp = 2;         // warps along M
    static constexpr index_t NWarp = 2;         // warps along N (A replicates across these)
    static constexpr index_t MIterPerWarp = 2;  // Y-repetition: M tiles swept per warp
    static constexpr index_t KIterPerWarp = 1;
    static constexpr index_t kM = 64; // 2 warps × 2 iters × 16
    // NOTE(review): kK = 64 but the embedded distribution only covers 16
    // columns in K (16-wide warp tile × KIterPerWarp = 1), and the banner
    // printed below says "64×16". Confirm the intended K extent — the sibling
    // test_a_y_slicing fixes kK to 16 "to match distribution coverage".
    static constexpr index_t kK = 64;

    // a:            device pointer to column-major A (kM × kK, leading dim lda)
    // debug_output: per-thread dump area, 8 slots per thread (256 × 8 on host)
    // lda:          stride between consecutive columns of A
    CK_TILE_DEVICE void operator()(const DataType* a,
                                   DataType* debug_output,
                                   index_t lda) const
    {
        // Single-block diagnostic: every other block exits immediately.
        if(get_block_id() != 0)
            return;
        const index_t tid = threadIdx.x;
        const index_t warp_id = tid / 64; // wave index within the block
        const index_t lane_id = tid % 64; // lane within the wave
        // Create tensor view for A (column-major: unit stride along M)
        const auto a_tensor = make_naive_tensor_view<address_space_enum::global>(
            a, make_tuple(kM, kK), make_tuple(1, lda), number<1>{}, number<1>{});
        // A distribution with Y-repetition (from tutorial_07):
        // warp-level 16×16 tile, 4 contiguous values per lane.
        constexpr auto a_warp_dstr_encode = tile_distribution_encoding<
            sequence<>,
            tuple<sequence<16>, sequence<4, 4>>,
            tuple<sequence<2, 1>>,
            tuple<sequence<0, 0>>,
            sequence<2>,
            sequence<1>>{};
        // Block-level outer encoding: replicate over NWarp, distribute
        // MIterPerWarp × MWarp along M, and expose the iteration counts as
        // Y dimensions so each warp sweeps MIterPerWarp tiles in M.
        constexpr auto a_block_outer_dstr_encode = tile_distribution_encoding<
            sequence<NWarp>,
            tuple<sequence<MIterPerWarp, MWarp>, sequence<KIterPerWarp>>,
            tuple<sequence<0, 1>>,
            tuple<sequence<0, 1>>,
            sequence<1, 2>,
            sequence<0, 0>>{};
        // Embed the warp-level encoding inside the block-level one.
        constexpr auto a_block_dstr_encode =
            detail::make_embed_tile_distribution_encoding(
                a_block_outer_dstr_encode, a_warp_dstr_encode);
        constexpr auto a_distribution = make_static_tile_distribution(a_block_dstr_encode);
        auto a_window = make_tile_window(
            a_tensor, make_tuple(number<kM>{}, number<kK>{}),
            {0, 0}, a_distribution);
        const auto a_tile = load_tile(a_window);
        const auto& thread_buffer = a_tile.get_thread_buffer();
        // Print from all warps sequentially
        __syncthreads();
        if(tid == 0) {
            printf("\n=== A Distribution with Y-Repetition Test ===\n");
            printf("Matrix: 64×16 (MWarp=2, MIterPerWarp=2, each warp loads 2×16 tiles)\n");
            printf("Input: A[m,k] = m + k*100 (unique values)\n\n");
        }
        __syncthreads();
        // One warp at a time (serialized via __syncthreads), lane 0 dumps
        // that warp's entire thread buffer.
        for(int w = 0; w < 4; ++w) {
            __syncthreads();
            if(warp_id == w && lane_id == 0) {
                printf("Warp %d (M-warp %d, N-warp %d):\n",
                       w, w/NWarp, w%NWarp);
                printf(" Thread buffer size: %d\n", static_cast<int>(thread_buffer.size()));
                printf(" Values: ");
                for(int i = 0; i < thread_buffer.size(); ++i) {
                    printf("%.0f ", static_cast<float>(thread_buffer[i]));
                }
                printf("\n");
            }
        }
        __syncthreads();
        // NOTE(review): the "Warps 0&2 identical" lines below contradict the
        // two lines just above them (warps 0 and 2 are said to hold different
        // M-rows). With A replicated across N-warps and the w/NWarp mapping,
        // warps 0&1 (and 2&3) would be the identical pairs — the sibling
        // test_a_y_slicing prints exactly that. Confirm and fix the legend.
        if(tid == 0) {
            printf("\n=== Expected Pattern ===\n");
            printf("Each warp should load 8 elements (2 M-iters × 1 K-iter × 4 warp elements)\n");
            printf("Warp 0 (M-warp 0): Should have M-rows [0-15] and [16-31]\n");
            printf("Warp 2 (M-warp 1): Should have M-rows [32-47] and [48-63]\n");
            printf("Warps 0&2 should be identical (NWarp replication)\n");
            printf("Warps 1&3 should be identical (NWarp replication)\n");
        }
        // Store for verification; assumes each thread holds at most 8 values
        // (matches the 256×8 debug buffer allocated by the host driver).
        for(int i = 0; i < thread_buffer.size(); ++i) {
            debug_output[tid * 8 + i] = thread_buffer[i];
        }
    }
};
int main()
{
std::cout << "\n==================================================\n";
std::cout << "Test A Distribution with Y-Dimension Repetition\n";
std::cout << "==================================================\n\n";
constexpr index_t M = 64;
constexpr index_t K = 64;
constexpr index_t lda = M;
using DataType = half_t;
std::vector<DataType> h_a(M * K);
std::vector<DataType> h_debug(256 * 8, -1);
// Initialize A[m,k] = m + k*100 (unique for each position)
for(index_t k = 0; k < K; ++k) {
for(index_t m = 0; m < M; ++m) {
h_a[m + k * lda] = static_cast<DataType>(m + k * 100);
}
}
DeviceMem d_a(M * K * sizeof(DataType));
DeviceMem d_debug(256 * 8 * sizeof(DataType));
d_a.ToDevice(h_a.data(), M * K * sizeof(DataType));
d_debug.ToDevice(h_debug.data(), 256 * 8 * sizeof(DataType));
stream_config stream;
launch_kernel(stream,
make_kernel<256>(
TestADistributionYRepetitionKernel<DataType>{},
dim3(1), dim3(256), 0,
static_cast<const DataType*>(d_a.GetDeviceBuffer()),
static_cast<DataType*>(d_debug.GetDeviceBuffer()),
lda));
hip_check_error(hipDeviceSynchronize());
d_debug.FromDevice(h_debug.data(), 256 * 8 * sizeof(DataType));
std::cout << "\n✓ Test completed - check GPU output above\n";
return 0;
}

View File

@@ -0,0 +1,191 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
/*
* Test A matrix Y-slicing with get_y_sliced_thread_data
*
* This test verifies that Y-dimension slicing works correctly for A by:
* 1. Loading the full block tile (64×16)
* 2. Using get_y_sliced_thread_data to extract individual iteration tiles
* 3. Printing what each warp gets for each iteration
*/
#include <iostream>
#include <vector>
#include <iomanip>
#include "ck_tile/core.hpp"
#include "ck_tile/host.hpp"
using namespace ck_tile;
// Diagnostic kernel: loads the full 64×16 A block once, then uses
// get_y_sliced_thread_data to carve out each per-iteration warp tile and
// prints what every warp received, verifying Y-dimension slicing for A.
template<typename DataType>
struct TestAYSlicingKernel
{
    static constexpr index_t kBlockSize = 256;  // 4 waves of 64 lanes
    static constexpr index_t MWarp = 2;         // warps along M
    static constexpr index_t NWarp = 2;         // warps along N (A replicates across these)
    static constexpr index_t MIterPerWarp = 2;  // Y-repetition: M tiles swept per warp
    static constexpr index_t KIterPerWarp = 1;
    static constexpr index_t kM = 64; // 2 warps × 2 iters × 16
    static constexpr index_t kK = 16; // Fixed to match distribution coverage

    // a:   device pointer to column-major A (kM × kK, leading dim lda)
    // lda: stride between consecutive columns of A
    CK_TILE_DEVICE void operator()(const DataType* a,
                                   index_t lda) const
    {
        // Single-block diagnostic: every other block exits immediately.
        if(get_block_id() != 0)
            return;
        const index_t tid = threadIdx.x;
        const index_t warp_id = tid / 64; // wave index within the block
        const index_t lane_id = tid % 64; // lane within the wave
        // Create tensor view for A (column-major M×K)
        const auto a_tensor = make_naive_tensor_view<address_space_enum::global>(
            a, make_tuple(kM, kK), make_tuple(1, lda), number<1>{}, number<1>{});
        // A warp-level distribution: 16×16 tile, 4 values per lane.
        constexpr auto a_warp_dstr_encode = tile_distribution_encoding<
            sequence<>,
            tuple<sequence<16>, sequence<4, 4>>,
            tuple<sequence<2, 1>>,
            tuple<sequence<0, 0>>,
            sequence<2>,
            sequence<1>>{};
        // A block-level outer distribution with Y-repetition: replicate over
        // NWarp, spread MIterPerWarp × MWarp along M, iteration counts become
        // the outer Y dimensions that get sliced below.
        constexpr auto a_block_outer_dstr_encode = tile_distribution_encoding<
            sequence<NWarp>,
            tuple<sequence<MIterPerWarp, MWarp>,
                  sequence<KIterPerWarp>>,
            tuple<sequence<0, 1>>,
            tuple<sequence<0, 1>>,
            sequence<1, 2>,
            sequence<0, 0>>{};
        constexpr auto a_block_dstr_encode =
            detail::make_embed_tile_distribution_encoding(
                a_block_outer_dstr_encode, a_warp_dstr_encode);
        constexpr auto a_block_distribution = make_static_tile_distribution(a_block_dstr_encode);
        // Y-dimension metadata of the warp tile: per-Y lengths, and an
        // all-zero origin used as the non-iteration part of the slice index.
        using AWarpDstr = decltype(make_static_tile_distribution(a_warp_dstr_encode));
        constexpr auto a_warp_y_lengths = to_sequence(AWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
        constexpr auto a_warp_y_index_zeros = uniform_sequence_gen_t<AWarpDstr::NDimY, 0>{};
        // Create window and load full block tile
        auto a_window = make_tile_window(
            a_tensor, make_tuple(number<kM>{}, number<kK>{}),
            {0, 0}, a_block_distribution);
        const auto a_block_tile = load_tile(a_window);
        __syncthreads();
        // NOTE(review): the legend below says A[m,k] = m*1000 + k, but the
        // host driver fills A with sequential values (m*K + k) — confirm and
        // align the legend with the actual fill pattern.
        if(tid == 0) {
            printf("\n=== A Y-Slicing Test ===\n");
            printf("Block tile: %d×%d (M×K)\n", kM, kK);
            printf("MIterPerWarp=%d, KIterPerWarp=%d\n", MIterPerWarp, KIterPerWarp);
            printf("Input: A[m,k] = m*1000 + k (unique values)\n\n");
        }
        __syncthreads();
        // Test Y-slicing for each warp and iteration (compile-time loops).
        static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
            static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
                // Destination tensor shaped like one warp tile.
                auto a_warp_tensor = make_static_distributed_tensor<DataType>(
                    make_static_tile_distribution(a_warp_dstr_encode));
                // Slice origin is (mIter, kIter) in the outer Y dims — the
                // same order as the outer encoding's Ys_to_Hs (M first, then
                // K) — followed by zeros for the warp-level Y dims.
                a_warp_tensor.get_thread_buffer() = a_block_tile.get_y_sliced_thread_data(
                    merge_sequences(sequence<mIter, kIter>{}, a_warp_y_index_zeros),
                    merge_sequences(sequence<1, 1>{}, a_warp_y_lengths));
                const auto& warp_buffer = a_warp_tensor.get_thread_buffer();
                // Print from each warp sequentially (capped at 16 values).
                for(int w = 0; w < 4; ++w) {
                    __syncthreads();
                    if(warp_id == w && lane_id == 0) {
                        printf("Warp %d (M-warp %d, N-warp %d), MIter=%d, KIter=%d:\n",
                               w, w/NWarp, w%NWarp, static_cast<int>(mIter), static_cast<int>(kIter));
                        printf(" Buffer size: %d\n", static_cast<int>(warp_buffer.size()));
                        printf(" Values: ");
                        for(int i = 0; i < warp_buffer.size() && i < 16; ++i) {
                            printf("%.0f ", static_cast<float>(warp_buffer[i]));
                        }
                        if(warp_buffer.size() > 16) printf("...");
                        printf("\n");
                    }
                }
            });
        });
        __syncthreads();
        // Human-readable key for interpreting the dump above.
        if(tid == 0) {
            printf("\n=== Expected Pattern ===\n");
            printf("Each warp should get 4 elements per iteration (16 M × 16 K / 64 threads)\n");
            printf("Warp 0, MIter=0: Should have values from A[0:16, 0:16]\n");
            printf("Warp 0, MIter=1: Should have values from A[16:32, 0:16]\n");
            printf("Warp 2, MIter=0: Should have values from A[32:48, 0:16]\n");
            printf("Warp 2, MIter=1: Should have values from A[48:64, 0:16]\n");
            printf("Warps 0&1 should REPLICATE (NWarp replication)\n");
            printf("Warps 2&3 should REPLICATE (NWarp replication)\n");
        }
    }
};
// Host driver for the A Y-slicing test: fills A with sequential values,
// prints a few landmarks, and launches the single-block diagnostic kernel.
int main()
{
    std::cout << "\n==================================================\n";
    std::cout << "Test A Y-Slicing with get_y_sliced_thread_data\n";
    std::cout << "==================================================\n\n";

    constexpr index_t M = 64; // 2 warps × 2 iters × 16
    constexpr index_t K = 16; // Match distribution coverage
    constexpr index_t lda = M;
    using DataType = half_t;

    std::vector<DataType> h_a(M * K);
    // Sequential fill in (m, k) order: element (m,k) receives m*K + k.
    // NOTE(review): an earlier comment claimed A[m,k] = m*1000 + k; the
    // actual values are sequential (0..M*K-1), which also stay exactly
    // representable in half precision.
    for(index_t m = 0; m < M; ++m) {
        for(index_t k = 0; k < K; ++k) {
            h_a[m + k * lda] = static_cast<DataType>(m * K + k);
        }
    }

    // Landmarks at warp-tile boundaries to cross-check the GPU dump.
    std::cout << "Matrix A (M×K = " << M << "×" << K << "):\n";
    std::cout << "Sample values:\n";
    std::cout << "  A[0,0] = " << static_cast<float>(h_a[0]) << "\n";
    std::cout << "  A[16,0] = " << static_cast<float>(h_a[16]) << "\n";
    std::cout << "  A[32,0] = " << static_cast<float>(h_a[32]) << "\n";
    std::cout << "  A[48,0] = " << static_cast<float>(h_a[48]) << "\n";
    std::cout << "  A[0,15] = " << static_cast<float>(h_a[15*lda]) << "\n\n";

    // Upload and launch one block of 256 threads.
    DeviceMem d_a(M * K * sizeof(DataType));
    d_a.ToDevice(h_a.data(), M * K * sizeof(DataType));
    stream_config stream;
    launch_kernel(stream,
                  make_kernel<256>(
                      TestAYSlicingKernel<DataType>{},
                      dim3(1), dim3(256), 0,
                      static_cast<const DataType*>(d_a.GetDeviceBuffer()),
                      lda));
    hip_check_error(hipDeviceSynchronize());

    std::cout << "\n✓ Test completed - check GPU output above\n";
    std::cout << "\nIf Y-slicing works correctly, you should see:\n";
    std::cout << "- Each warp gets different M-row ranges for different iterations\n";
    std::cout << "- Warps 0&1 should have identical values (NWarp replication)\n";
    std::cout << "- Warps 2&3 should have identical values (NWarp replication)\n";
    return 0;
}

View File

@@ -0,0 +1,165 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
/*
* Test B matrix distribution with Y-dimension repetition
*
* Goal: Load a 16x64 B matrix with 256 threads (4 warps in 2x2 config)
* With NIterPerWarp=2, each warp should load 2 tiles of 16x16 in N dimension
*/
#include <iostream>
#include <vector>
#include <iomanip>
#include "ck_tile/core.hpp"
#include "ck_tile/host.hpp"
using namespace ck_tile;
// Diagnostic kernel: loads a block of row-major B through a distribution with
// Y-dimension repetition and has each warp's lane 0 print its thread buffer,
// so replication across M-warps and N-tile sweeping can be inspected.
template<typename DataType>
struct TestBDistributionYRepetitionKernel
{
    static constexpr index_t kBlockSize = 256;  // 4 waves of 64 lanes
    static constexpr index_t MWarp = 2;         // warps along M (B replicates across these)
    static constexpr index_t NWarp = 2;         // warps along N
    static constexpr index_t NIterPerWarp = 2;  // Y-repetition: N tiles swept per warp
    static constexpr index_t KIterPerWarp = 1;
    // NOTE(review): kK = 64 but the distribution only covers 16 rows in K, and
    // the banner printed below says "16×64". Confirm the intended K extent —
    // the sibling test_b_y_slicing fixes kK to 16.
    static constexpr index_t kK = 64;
    static constexpr index_t kN = 64; // 2 warps × 2 iters × 16

    // b:            device pointer to row-major B (kK × kN, leading dim ldb)
    // debug_output: per-thread dump area, 8 slots per thread (256 × 8 on host)
    // ldb:          stride between consecutive rows of B
    CK_TILE_DEVICE void operator()(const DataType* b,
                                   DataType* debug_output,
                                   index_t ldb) const
    {
        // Single-block diagnostic: every other block exits immediately.
        if(get_block_id() != 0)
            return;
        // Coverage note: each warp tile is 16 wide in K with 4 values per
        // lane, so 4 warps × 2 iterations do not span the full 64×64 matrix.
        const index_t tid = threadIdx.x;
        const index_t warp_id = tid / 64; // wave index within the block
        const index_t lane_id = tid % 64; // lane within the wave
        // Create tensor view for B (row-major: unit stride along N,
        // vectorized loads of 4 along the contiguous dimension)
        const auto b_tensor = make_naive_tensor_view<address_space_enum::global>(
            b, make_tuple(kK, kN), make_tuple(ldb, 1), number<4>{}, number<1>{});
        // B distribution with Y-repetition (from tutorial_07):
        // warp-level 16×16 tile, 4 values per lane along K.
        constexpr auto b_warp_dstr_encode = tile_distribution_encoding<
            sequence<>,
            tuple<sequence<4, 4>, sequence<16>>,
            tuple<sequence<1, 2>>,
            tuple<sequence<0, 0>>,
            sequence<1>,
            sequence<1>>{};
        // Block-level outer encoding: replicate over MWarp, distribute
        // NIterPerWarp × NWarp along N, iteration counts become Y dims.
        constexpr auto b_block_outer_dstr_encode = tile_distribution_encoding<
            sequence<MWarp>,
            tuple<sequence<KIterPerWarp>,
                  sequence<NIterPerWarp, NWarp>>,
            tuple<sequence<2, 0>>,
            tuple<sequence<1, 0>>,
            sequence<1, 2>,
            sequence<0, 0>>{};
        // Embed the warp-level encoding inside the block-level one.
        constexpr auto b_block_dstr_encode =
            detail::make_embed_tile_distribution_encoding(
                b_block_outer_dstr_encode, b_warp_dstr_encode);
        constexpr auto b_distribution = make_static_tile_distribution(b_block_dstr_encode);
        auto b_window = make_tile_window(
            b_tensor, make_tuple(number<kK>{}, number<kN>{}),
            {0, 0}, b_distribution);
        const auto b_tile = load_tile(b_window);
        const auto& thread_buffer = b_tile.get_thread_buffer();
        // Print from all warps sequentially
        __syncthreads();
        // NOTE(review): the legend says B[k,n] = k + n*100, but the host
        // driver fills B with sequential values (k*N + n) — confirm and align.
        if(tid == 0) {
            printf("\n=== B Distribution with Y-Repetition Test ===\n");
            printf("Matrix: 16×64 (NWarp=2, NIterPerWarp=2, each warp loads 2×16 tiles)\n");
            printf("Input: B[k,n] = k + n*100 (unique values)\n\n");
        }
        __syncthreads();
        // One warp at a time, lane 0 dumps that warp's entire thread buffer.
        for(int w = 0; w < 4; ++w) {
            __syncthreads();
            if(warp_id == w && lane_id == 0) {
                printf("Warp %d (M-warp %d, N-warp %d):\n",
                       w, w/NWarp, w%NWarp);
                printf(" Thread buffer size: %d\n", static_cast<int>(thread_buffer.size()));
                printf(" Values: ");
                for(int i = 0; i < thread_buffer.size(); ++i) {
                    printf("%.0f ", static_cast<float>(thread_buffer[i]));
                }
                printf("\n");
            }
        }
        __syncthreads();
        // NOTE(review): "Warps 0&1 identical (MWarp replication)" contradicts
        // the two lines above it (warps 0 and 1 hold different N-cols). With B
        // replicated across M-warps and the w/NWarp mapping, the identical
        // pairs would be 0&2 and 1&3 — the sibling test_b_y_slicing prints
        // that pairing. Confirm and fix the legend.
        if(tid == 0) {
            printf("\n=== Expected Pattern ===\n");
            printf("Each warp should load 8 elements (2 N-iters × 1 K-iter × 4 warp elements)\n");
            printf("Warp 0 (N-warp 0): Should have N-cols [0-15] and [16-31]\n");
            printf("Warp 1 (N-warp 1): Should have N-cols [32-47] and [48-63]\n");
            printf("Warps 0&1 should be identical (MWarp replication)\n");
            printf("Warps 2&3 should be identical (MWarp replication)\n");
        }
        // Store for verification; assumes each thread holds at most 8 values
        // (matches the 256×8 debug buffer allocated by the host driver).
        for(int i = 0; i < thread_buffer.size(); ++i) {
            debug_output[tid * 8 + i] = thread_buffer[i];
        }
    }
};
// Host driver: builds a sequentially-numbered B matrix, runs the single-block
// diagnostic kernel, and copies back the per-thread debug dump.
int main()
{
    std::cout << "\n==================================================\n";
    std::cout << "Test B Distribution with Y-Dimension Repetition\n";
    std::cout << "==================================================\n\n";

    constexpr index_t K = 64;
    constexpr index_t N = 64;
    constexpr index_t ldb = N;
    using DataType = half_t;

    // h_b holds the input pattern; h_debug is pre-filled with -1 so slots the
    // kernel never writes remain recognizable.
    std::vector<DataType> h_b(K * N);
    std::vector<DataType> h_debug(256 * 8, -1);

    // Sequential fill in (k, n) order: element (k,n) receives k*N + n.
    // NOTE(review): an earlier comment claimed B[k,n] = k + n*100; the actual
    // values are sequential (0..K*N-1).
    for(index_t k = 0; k < K; ++k) {
        for(index_t n = 0; n < N; ++n) {
            h_b[k * ldb + n] = static_cast<DataType>(k * N + n);
        }
    }

    // Allocate device buffers and upload both host arrays.
    DeviceMem d_b(K * N * sizeof(DataType));
    DeviceMem d_debug(256 * 8 * sizeof(DataType));
    d_b.ToDevice(h_b.data(), K * N * sizeof(DataType));
    d_debug.ToDevice(h_debug.data(), 256 * 8 * sizeof(DataType));

    // One block of 256 threads; the kernel prints its findings via printf.
    stream_config stream;
    launch_kernel(stream,
                  make_kernel<256>(
                      TestBDistributionYRepetitionKernel<DataType>{},
                      dim3(1), dim3(256), 0,
                      static_cast<const DataType*>(d_b.GetDeviceBuffer()),
                      static_cast<DataType*>(d_debug.GetDeviceBuffer()),
                      ldb));
    hip_check_error(hipDeviceSynchronize());

    // Retrieve the debug dump (inspection is manual; no automated check here).
    d_debug.FromDevice(h_debug.data(), 256 * 8 * sizeof(DataType));
    std::cout << "\n✓ Test completed - check GPU output above\n";
    return 0;
}

View File

@@ -0,0 +1,189 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
/*
* Test B matrix Y-slicing with get_y_sliced_thread_data
*
* This test verifies that Y-dimension slicing works correctly by:
* 1. Loading the full block tile (16×64)
* 2. Using get_y_sliced_thread_data to extract individual iteration tiles
* 3. Printing what each warp gets for each iteration
*/
#include <iostream>
#include <vector>
#include <iomanip>
#include "ck_tile/core.hpp"
#include "ck_tile/host.hpp"
using namespace ck_tile;
// Diagnostic kernel: loads the full 16×64 B block once, then uses
// get_y_sliced_thread_data to carve out each per-iteration warp tile and
// prints what every warp received, verifying Y-dimension slicing for B.
template<typename DataType>
struct TestBYSlicingKernel
{
    static constexpr index_t kBlockSize = 256;  // 4 waves of 64 lanes
    static constexpr index_t MWarp = 2;         // warps along M (B replicates across these)
    static constexpr index_t NWarp = 2;         // warps along N
    static constexpr index_t NIterPerWarp = 2;  // Y-repetition: N tiles swept per warp
    static constexpr index_t KIterPerWarp = 1;
    static constexpr index_t kK = 16; // Fixed to match distribution coverage
    static constexpr index_t kN = 64; // 2 warps × 2 iters × 16

    // b:   device pointer to row-major B (kK × kN, leading dim ldb)
    // ldb: stride between consecutive rows of B
    CK_TILE_DEVICE void operator()(const DataType* b,
                                   index_t ldb) const
    {
        // Single-block diagnostic: every other block exits immediately.
        if(get_block_id() != 0)
            return;
        const index_t tid = threadIdx.x;
        const index_t warp_id = tid / 64; // wave index within the block
        const index_t lane_id = tid % 64; // lane within the wave
        // Create tensor view for B (row-major K×N, vectorized loads of 4)
        const auto b_tensor = make_naive_tensor_view<address_space_enum::global>(
            b, make_tuple(kK, kN), make_tuple(ldb, 1), number<4>{}, number<1>{});
        // B warp-level distribution: 16×16 tile, 4 values per lane along K.
        constexpr auto b_warp_dstr_encode = tile_distribution_encoding<
            sequence<>,
            tuple<sequence<4, 4>, sequence<16>>,
            tuple<sequence<1, 2>>,
            tuple<sequence<0, 0>>,
            sequence<1>,
            sequence<1>>{};
        // B block-level outer distribution with Y-repetition: replicate over
        // MWarp, spread NIterPerWarp × NWarp along N, iteration counts become
        // the outer Y dimensions that get sliced below.
        constexpr auto b_block_outer_dstr_encode = tile_distribution_encoding<
            sequence<MWarp>,
            tuple<sequence<KIterPerWarp>,
                  sequence<NIterPerWarp, NWarp>>,
            tuple<sequence<2, 0>>,
            tuple<sequence<1, 0>>,
            sequence<1, 2>,
            sequence<0, 0>>{};
        constexpr auto b_block_dstr_encode =
            detail::make_embed_tile_distribution_encoding(
                b_block_outer_dstr_encode, b_warp_dstr_encode);
        constexpr auto b_block_distribution = make_static_tile_distribution(b_block_dstr_encode);
        // Y-dimension metadata of the warp tile: per-Y lengths, and an
        // all-zero origin used as the non-iteration part of the slice index.
        using BWarpDstr = decltype(make_static_tile_distribution(b_warp_dstr_encode));
        constexpr auto b_warp_y_lengths = to_sequence(BWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
        constexpr auto b_warp_y_index_zeros = uniform_sequence_gen_t<BWarpDstr::NDimY, 0>{};
        // Create window and load full block tile
        auto b_window = make_tile_window(
            b_tensor, make_tuple(number<kK>{}, number<kN>{}),
            {0, 0}, b_block_distribution);
        const auto b_block_tile = load_tile(b_window);
        __syncthreads();
        // NOTE(review): the legend below says B[k,n] = k*1000 + n, but the
        // host driver fills B with sequential values (k*N + n) — confirm and
        // align the legend with the actual fill pattern.
        if(tid == 0) {
            printf("\n=== B Y-Slicing Test ===\n");
            printf("Block tile: %d×%d (K×N)\n", kK, kN);
            printf("NIterPerWarp=%d, KIterPerWarp=%d\n", NIterPerWarp, KIterPerWarp);
            printf("Input: B[k,n] = k*1000 + n (unique values)\n\n");
        }
        __syncthreads();
        // Test Y-slicing for each warp and iteration (compile-time loops).
        static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
            static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
                // Destination tensor shaped like one warp tile.
                auto b_warp_tensor = make_static_distributed_tensor<DataType>(
                    make_static_tile_distribution(b_warp_dstr_encode));
                // Slice origin is (kIter, nIter) in the outer Y dims — K
                // first, matching the outer encoding's Ys_to_Hs order — then
                // zeros for the warp-level Y dims.
                b_warp_tensor.get_thread_buffer() = b_block_tile.get_y_sliced_thread_data(
                    merge_sequences(sequence<kIter, nIter>{}, b_warp_y_index_zeros),
                    merge_sequences(sequence<1, 1>{}, b_warp_y_lengths));
                const auto& warp_buffer = b_warp_tensor.get_thread_buffer();
                // Print from each warp sequentially (capped at 16 values).
                for(int w = 0; w < 4; ++w) {
                    __syncthreads();
                    if(warp_id == w && lane_id == 0) {
                        printf("Warp %d (M-warp %d, N-warp %d), NIter=%d, KIter=%d:\n",
                               w, w/NWarp, w%NWarp, static_cast<int>(nIter), static_cast<int>(kIter));
                        printf(" Buffer size: %d\n", static_cast<int>(warp_buffer.size()));
                        printf(" Values: ");
                        for(int i = 0; i < warp_buffer.size() && i < 16; ++i) {
                            printf("%.0f ", static_cast<float>(warp_buffer[i]));
                        }
                        if(warp_buffer.size() > 16) printf("...");
                        printf("\n");
                    }
                }
            });
        });
        __syncthreads();
        // Human-readable key for interpreting the dump above.
        if(tid == 0) {
            printf("\n=== Expected Pattern ===\n");
            printf("Each warp should get 4 elements per iteration (16 K × 16 N / 64 threads)\n");
            printf("Warp 0, NIter=0: Should have values from B[0:16, 0:16]\n");
            printf("Warp 0, NIter=1: Should have values from B[0:16, 16:32]\n");
            printf("Warp 1, NIter=0: Should have values from B[0:16, 32:48]\n");
            printf("Warp 1, NIter=1: Should have values from B[0:16, 48:64]\n");
            printf("Warps 2&3 should REPLICATE warps 0&1 (MWarp replication)\n");
        }
    }
};
};
// Host driver for the B Y-slicing test: fills B with sequential values,
// prints a few landmarks, and launches the single-block diagnostic kernel.
int main()
{
    std::cout << "\n==================================================\n";
    std::cout << "Test B Y-Slicing with get_y_sliced_thread_data\n";
    std::cout << "==================================================\n\n";

    constexpr index_t K = 16; // Match distribution coverage
    constexpr index_t N = 64;
    constexpr index_t ldb = N;
    using DataType = half_t;

    std::vector<DataType> h_b(K * N);
    // Sequential fill in (k, n) order: element (k,n) receives k*N + n.
    // NOTE(review): an earlier comment claimed B[k,n] = k*1000 + n; the
    // actual values are sequential (0..K*N-1), which also stay exactly
    // representable in half precision.
    for(index_t k = 0; k < K; ++k) {
        for(index_t n = 0; n < N; ++n) {
            h_b[k * ldb + n] = static_cast<DataType>(k * N + n);
        }
    }

    // Landmarks at warp-tile boundaries to cross-check the GPU dump.
    std::cout << "Matrix B (K×N = " << K << "×" << N << "):\n";
    std::cout << "Sample values:\n";
    std::cout << "  B[0,0] = " << static_cast<float>(h_b[0]) << "\n";
    std::cout << "  B[0,16] = " << static_cast<float>(h_b[16]) << "\n";
    std::cout << "  B[0,32] = " << static_cast<float>(h_b[32]) << "\n";
    std::cout << "  B[0,48] = " << static_cast<float>(h_b[48]) << "\n";
    std::cout << "  B[15,0] = " << static_cast<float>(h_b[15*ldb]) << "\n\n";

    // Upload and launch one block of 256 threads.
    DeviceMem d_b(K * N * sizeof(DataType));
    d_b.ToDevice(h_b.data(), K * N * sizeof(DataType));
    stream_config stream;
    launch_kernel(stream,
                  make_kernel<256>(
                      TestBYSlicingKernel<DataType>{},
                      dim3(1), dim3(256), 0,
                      static_cast<const DataType*>(d_b.GetDeviceBuffer()),
                      ldb));
    hip_check_error(hipDeviceSynchronize());

    std::cout << "\n✓ Test completed - check GPU output above\n";
    std::cout << "\nIf Y-slicing works correctly, you should see:\n";
    std::cout << "- Each warp gets different N-column ranges for different iterations\n";
    std::cout << "- Warps 0&2 should have identical values (MWarp replication)\n";
    std::cout << "- Warps 1&3 should have identical values (MWarp replication)\n";
    return 0;
}

View File

@@ -0,0 +1,543 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
/*
* Tutorial 07: Tile Sweeping with Y-Dimension Repetition
*
* Demonstrates TRUE tile sweeping where each warp iterates over multiple tiles
* using Y-dimension repetition in the distribution encoding. This follows the
* pattern from 02_gemm/block_gemm_asmem_bsmem_creg.hpp.
*
* Key concepts:
* - Multiple warps per block (2×2 warp configuration) - SAME as Tutorial 06
* - Y-dimension repetition enables each warp to sweep over multiple tiles
* - MIterPerWarp and NIterPerWarp control tile iterations
* - get_y_sliced_thread_data extracts specific tiles from block tensor
* - static_for loops iterate over tile indices at compile time
* - Tile distributions with replication still work with Y-repetition
*/
#include <iostream>
#include <vector>
#include <iomanip>
#include <chrono>
#include <limits>
#include "ck_tile/core.hpp"
#include "ck_tile/host.hpp"
#include "ck_tile/ops/common.hpp"
#include "ck_tile/ops/gemm/warp/warp_gemm.hpp"
using namespace ck_tile;
// Tile Sweeping HGEMM kernel with Y-dimension repetition
template<typename ADataType, typename BDataType, typename CDataType, typename AccDataType>
struct TileSweepingYRepetitionHgemmKernel
{
static constexpr index_t kWaveSize = 64; // AMD wave size
static constexpr index_t kWarpM = 16; // MFMA M dimension per warp
static constexpr index_t kWarpN = 16; // MFMA N dimension per warp
static constexpr index_t kWarpK = 16; // MFMA K dimension per instruction
// Warp configuration: 2×2 warps per block (SAME as Tutorial 06)
static constexpr index_t MWarp = 2; // 2 warps in M dimension
static constexpr index_t NWarp = 2; // 2 warps in N dimension
static constexpr index_t kBlockSize = MWarp * NWarp * kWaveSize; // 256 threads
// NEW: Tile iterations per warp (Y-dimension repetition)
static constexpr index_t MIterPerWarp = 2; // Each warp sweeps 2 tiles in M
static constexpr index_t NIterPerWarp = 2; // Each warp sweeps 2 tiles in N
static constexpr index_t KIterPerWarp = 1; // K handled in main loop
// Use ck_tile's WarpGemm for MFMA
using WarpGemm = WarpGemmMfmaF16F16F32M16N16K16;
CK_TILE_DEVICE void operator()(const ADataType* a,
const BDataType* b,
const CDataType* c,
CDataType* d,
index_t M,
index_t N,
index_t K,
index_t lda, // Leading dimension of A (column-major)
index_t ldb, // Leading dimension of B (row-major)
index_t ldc, // Leading dimension of C (column-major)
index_t ldd, // Leading dimension of D (column-major)
AccDataType alpha,
AccDataType beta) const
{
// Calculate which warp this thread belongs to within the block
[[maybe_unused]] const index_t warp_id = get_warp_id();
[[maybe_unused]] const index_t iMWarp = warp_id / NWarp; // M-warp index (0 or 1)
[[maybe_unused]] const index_t iNWarp = warp_id % NWarp; // N-warp index (0 or 1)
// Calculate base offset for this block
// Each block now computes (MWarp × MIterPerWarp × kWarpM) × (NWarp × NIterPerWarp × kWarpN)
const index_t kMPerBlock = MWarp * MIterPerWarp * kWarpM; // 2×2×16 = 64
const index_t kNPerBlock = NWarp * NIterPerWarp * kWarpN; // 2×2×16 = 64
// Calculate block position in 2D grid
const index_t num_blocks_n = N / kNPerBlock;
const index_t block_m = get_block_id() / num_blocks_n;
const index_t block_n = get_block_id() % num_blocks_n;
const index_t m_block_base = block_m * kMPerBlock;
const index_t n_block_base = block_n * kNPerBlock;
// Bounds check
if(m_block_base >= M || n_block_base >= N)
return;
// Create tensor views for matrices
const auto a_tensor = make_naive_tensor_view<address_space_enum::global>(
a,
make_tuple(M, K),
make_tuple(1, lda),
number<1>{},
number<1>{}
);
const auto b_tensor = make_naive_tensor_view<address_space_enum::global>(
b,
make_tuple(K, N),
make_tuple(ldb, 1),
number<4>{},
number<1>{}
);
const auto c_tensor = make_naive_tensor_view<address_space_enum::global>(
c,
make_tuple(M, N),
make_tuple(1, ldc),
number<1>{},
number<1>{}
);
auto d_tensor = make_naive_tensor_view<address_space_enum::global>(
d,
make_tuple(M, N),
make_tuple(1, ldd),
number<1>{},
number<1>{}
);
// ============================================================================
// TILE DISTRIBUTIONS with Y-DIMENSION REPETITION (following 02_gemm pattern)
// ============================================================================
// A Distribution: Block-level with Y-repetition
// Warp-level distribution (same as Tutorial 06)
constexpr auto a_warp_dstr_encode = tile_distribution_encoding<
sequence<>,
tuple<sequence<16>, sequence<4, 4>>,
tuple<sequence<2, 1>>,
tuple<sequence<0, 0>>,
sequence<2>,
sequence<1>>{};
// constexpr auto a_block_outer_dstr_encoding =
// tile_distribution_encoding<sequence<NWarp>,
// tuple<sequence<MIterPerWarp, MWarp>, sequence<KIterPerWarp>>,
// tuple<sequence<1, 0>>,
// tuple<sequence<1, 0>>,
// sequence<1, 2>,
// sequence<0, 0>>{};
// Block-level outer distribution with Y-repetition
constexpr auto a_block_outer_dstr_encode = tile_distribution_encoding<
sequence<NWarp>, // Replicate across N-warps
tuple<sequence<MIterPerWarp, MWarp>, sequence<KIterPerWarp>>, // H0: 2 iters × 2 warps in M
tuple<sequence<0, 1>>, // Ps_to_Hs
tuple<sequence<0, 1>>, // Ps_in_Hs
sequence<1, 2>, // Ys_to_Hs: Y maps to BOTH M and K
sequence<0, 0>>{}; // Ys_in_Hs
constexpr auto a_block_dstr_encode =
detail::make_embed_tile_distribution_encoding(
a_block_outer_dstr_encode, a_warp_dstr_encode);
// B Distribution: Block-level with Y-repetition
constexpr auto b_warp_dstr_encode = tile_distribution_encoding<
sequence<>,
tuple<sequence<4, 4>, sequence<16>>,
tuple<sequence<1, 2>>,
tuple<sequence<0, 0>>,
sequence<1>,
sequence<1>>{};
// constexpr auto b_block_outer_dstr_encode =
// tile_distribution_encoding<sequence<MWarp>,
// tuple<sequence<NIterPerWarp, NWarp>, sequence<KIterPerWarp>>,
// tuple<sequence<0, 1>>,
// tuple<sequence<0, 1>>,
// sequence<1, 2>,
// sequence<0, 0>>{};
constexpr auto b_block_outer_dstr_encode = tile_distribution_encoding<
sequence<MWarp>, // Replicate across M-warps
tuple<sequence<KIterPerWarp>, // H0: 2 iters × 2 warps in N
sequence<NIterPerWarp, NWarp>>, // H1: 1 K-chunk
tuple<sequence<2, 0>>, // Ps_to_Hs
tuple<sequence<1, 0>>, // Ps_in_Hs
sequence<1, 2>, // Ys_to_Hs: Y maps to BOTH N and K
sequence<0, 0>>{}; // Ys_in_Hs
constexpr auto b_block_dstr_encode =
detail::make_embed_tile_distribution_encoding(
b_block_outer_dstr_encode, b_warp_dstr_encode);
// // C Distribution: Block-level with Y-repetition for output
constexpr auto c_warp_dstr_encode = tile_distribution_encoding<
sequence<>,
tuple<sequence<4, 4>, sequence<16>>,
tuple<sequence<1, 2>>,
tuple<sequence<0, 0>>,
sequence<1>,
sequence<1>>{};
// constexpr auto c_block_outer_dstr_encode = tile_distribution_encoding<
// sequence<>,
// tuple<sequence<MIterPerWarp, MWarp>, sequence<NIterPerWarp, NWarp>>,
// tuple<sequence<1, 2>>,
// tuple<sequence<1, 1>>,
// sequence<1, 2>,
// sequence<0, 0>>{};
constexpr auto c_block_outer_dstr_encode = tile_distribution_encoding<
sequence<>, // No replication for output
tuple<sequence<MIterPerWarp, MWarp>, // H0: M iterations
sequence<NIterPerWarp, NWarp>>, // H1: N iterations
tuple<sequence<2, 1>>, // Ps_to_Hs
tuple<sequence<1, 1>>, // Ps_in_Hs
sequence<1, 2>, // Ys_to_Hs: Y maps to BOTH M and N
sequence<0, 0>>{}; // Ys_in_Hs
constexpr auto c_block_dstr_encode =
detail::make_embed_tile_distribution_encoding(
c_block_outer_dstr_encode, c_warp_dstr_encode);
// Create distributions
constexpr auto a_block_distribution = make_static_tile_distribution(a_block_dstr_encode);
constexpr auto b_block_distribution = make_static_tile_distribution(b_block_dstr_encode);
constexpr auto c_block_distribution = make_static_tile_distribution(c_block_dstr_encode);
// Get Y-dimension information for slicing
using AWarpDstr = decltype(make_static_tile_distribution(a_warp_dstr_encode));
using BWarpDstr = decltype(make_static_tile_distribution(b_warp_dstr_encode));
using CWarpDstr = decltype(make_static_tile_distribution(c_warp_dstr_encode));
constexpr auto a_warp_y_lengths = to_sequence(AWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
constexpr auto b_warp_y_lengths = to_sequence(BWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
constexpr auto c_warp_y_lengths = to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
constexpr auto a_warp_y_index_zeros = uniform_sequence_gen_t<AWarpDstr::NDimY, 0>{};
constexpr auto b_warp_y_index_zeros = uniform_sequence_gen_t<BWarpDstr::NDimY, 0>{};
constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t<CWarpDstr::NDimY, 0>{};
// // Create block-level windows
auto a_block_window = make_tile_window(
a_tensor,
make_tuple(number<kMPerBlock>{}, number<kWarpK>{}),
{m_block_base, 0},
a_block_distribution
);
auto b_block_window = make_tile_window(
b_tensor,
make_tuple(number<kWarpK>{}, number<kNPerBlock>{}),
{0, n_block_base},
b_block_distribution
);
// // Create block-level accumulator tile
auto c_block_tile = make_static_distributed_tensor<AccDataType>(c_block_distribution);
set_tile(c_block_tile, AccDataType{0});
// // Main K-loop
const index_t num_k_loops = K / kWarpK;
for(index_t k_iter = 0; k_iter < num_k_loops; ++k_iter)
{
// Load entire block tiles (all iterations at once)
const auto a_block_tile = load_tile(a_block_window);
const auto b_block_tile = load_tile(b_block_window);
// Nested loops over tile iterations using Y-slicing (like 02_gemm)
static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
// Extract A warp tensor for this M-iteration using Y-slicing
auto a_warp_tensor = make_static_distributed_tensor<ADataType>(
make_static_tile_distribution(a_warp_dstr_encode));
a_warp_tensor.get_thread_buffer() = a_block_tile.get_y_sliced_thread_data(
merge_sequences(sequence<mIter, kIter>{}, a_warp_y_index_zeros),
merge_sequences(sequence<1, 1>{}, a_warp_y_lengths));
static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
// Extract B warp tensor for this N-iteration using Y-slicing
auto b_warp_tensor = make_static_distributed_tensor<BDataType>(
make_static_tile_distribution(b_warp_dstr_encode));
b_warp_tensor.get_thread_buffer() = b_block_tile.get_y_sliced_thread_data(
merge_sequences(sequence<kIter, nIter>{}, b_warp_y_index_zeros),
merge_sequences(sequence<1, 1>{}, b_warp_y_lengths));
// Extract C warp tensor for this M,N iteration
auto c_warp_tensor = make_static_distributed_tensor<AccDataType>(
make_static_tile_distribution(c_warp_dstr_encode));
c_warp_tensor.get_thread_buffer() = c_block_tile.get_y_sliced_thread_data(
merge_sequences(sequence<mIter, nIter>{}, c_warp_y_index_zeros),
merge_sequences(sequence<1, 1>{}, c_warp_y_lengths));
// Warp GEMM: C[mIter, nIter] += A[mIter, kIter] * B[nIter, kIter]
WarpGemm{}(c_warp_tensor, a_warp_tensor, b_warp_tensor);
// Write C warp tensor back to block tensor
c_block_tile.set_y_sliced_thread_data(
merge_sequences(sequence<mIter, nIter>{}, c_warp_y_index_zeros),
merge_sequences(sequence<1, 1>{}, c_warp_y_lengths),
c_warp_tensor.get_thread_buffer());
});
});
});
// Move windows to next K chunk
if(k_iter < num_k_loops - 1) {
move_tile_window(a_block_window, {0, kWarpK});
move_tile_window(b_block_window, {kWarpK, 0});
}
}
// Scale by alpha
tile_elementwise_inout([alpha](auto& acc_val) { acc_val *= alpha; }, c_block_tile);
// Add beta * C if needed
if(std::abs(beta) > 1e-6f)
{
auto c_block_window = make_tile_window(
c_tensor,
make_tuple(number<kMPerBlock>{}, number<kNPerBlock>{}),
{m_block_base, n_block_base},
c_block_distribution
);
const auto c_input_block_tile = load_tile(c_block_window);
tile_elementwise_inout(
[beta](const auto& c_val, auto& acc_val) {
acc_val += beta * c_val;
},
c_input_block_tile, c_block_tile);
}
// Store final result to D
auto d_block_window = make_tile_window(
d_tensor,
make_tuple(number<kMPerBlock>{}, number<kNPerBlock>{}),
{m_block_base, n_block_base},
c_block_distribution
);
store_tile(d_block_window, c_block_tile);
}
};
// CPU reference for verification
template<typename InType, typename AccType>
void reference_gemm_mixed(const std::vector<InType>& a,
const std::vector<InType>& b,
const std::vector<AccType>& c,
std::vector<AccType>& d,
index_t M, index_t N, index_t K,
index_t lda, index_t ldb, index_t ldc, index_t ldd,
AccType alpha, AccType beta)
{
for(index_t n = 0; n < N; ++n) {
for(index_t m = 0; m < M; ++m) {
AccType sum = 0;
for(index_t k = 0; k < K; ++k) {
sum += static_cast<AccType>(a[m + k * lda]) *
static_cast<AccType>(b[k * ldb + n]);
}
d[m + n * ldd] = alpha * sum + beta * c[m + n * ldc];
}
}
}
// Fill `data` with pseudo-random values uniformly drawn from [min_val, max_val].
// Uses the C library rand() generator, so the sequence is reproducible after a
// fixed srand(seed) call (one rand() consumed per element, in order).
template<typename T>
void fill_random(std::vector<T>& data, T min_val = -1, T max_val = 1)
{
    for(auto it = data.begin(); it != data.end(); ++it) {
        *it = static_cast<T>(min_val + (max_val - min_val) *
                             static_cast<float>(rand()) / RAND_MAX);
    }
}
// Tutorial 07 host driver: runs D = alpha * A*B + beta * C on the GPU
// (half_t inputs, float accumulation) using per-warp tile sweeping with
// Y-dimension repetition, verifies the result against the CPU reference,
// and reports timing/throughput.
// Returns 0 when verification passes, 1 on mismatch.
int main()
{
    std::cout << "\n==================================================\n";
    std::cout << "Tutorial 07: Tile Sweeping with Y-Dimension Repetition\n";
    std::cout << "==================================================\n\n";
    std::cout << "Key features:\n";
    std::cout << "• Multiple warps per block (2×2 warp configuration)\n";
    std::cout << "• Y-dimension repetition: MIterPerWarp=2, NIterPerWarp=2\n";
    std::cout << "• Each warp sweeps over 2×2 = 4 output tiles\n";
    std::cout << "• Uses get_y_sliced_thread_data for tile extraction\n";
    std::cout << "• Follows 02_gemm pattern for production-ready code\n\n";
    // Problem size: Each block computes 64×64 (2×2 warps × 2×2 iters × 16×16)
    constexpr index_t M = 128;
    constexpr index_t N = 128;
    constexpr index_t K = 64;
    // Leading dimensions: A, C, D are column-major (ld = row count);
    // B is row-major (ld = column count) — matching the kernel's tensor views.
    constexpr index_t lda = M;
    constexpr index_t ldb = N;
    constexpr index_t ldc = M;
    constexpr index_t ldd = M;
    // Mixed precision: half-precision inputs, float accumulation/output.
    using InputType = half_t;
    using AccumType = float;
    constexpr AccumType alpha = 2.0f;
    constexpr AccumType beta = 1.5f;
    std::cout << "Problem configuration:\n";
    std::cout << " M×N×K: " << M << "×" << N << "×" << K << "\n";
    std::cout << " Block output: 64×64 (2 warps × 2 iters × 16)\n";
    std::cout << " Warp output: 32×32 (2 iters × 16 in each dim)\n";
    std::cout << " Total blocks: " << (M/64) << "×" << (N/64) << "\n\n";
    // Host buffers. D is seeded with NaN so any element the kernel fails to
    // write is caught by verification instead of passing by accident.
    std::vector<InputType> h_a(M * K);
    std::vector<InputType> h_b(K * N);
    std::vector<AccumType> h_c(M * N);
    std::vector<AccumType> h_d(M * N, std::numeric_limits<AccumType>::quiet_NaN());
    std::vector<AccumType> h_d_ref(M * N);
    // Fixed seed keeps the run reproducible.
    srand(42);
    fill_random(h_a, InputType(-1), InputType(1));
    fill_random(h_b, InputType(-1), InputType(1));
    fill_random(h_c, AccumType(-1), AccumType(1));
    // CPU reference (timed for the speedup report below)
    auto cpu_start = std::chrono::high_resolution_clock::now();
    reference_gemm_mixed(h_a, h_b, h_c, h_d_ref, M, N, K, lda, ldb, ldc, ldd, alpha, beta);
    auto cpu_end = std::chrono::high_resolution_clock::now();
    double cpu_time_ms = std::chrono::duration<double, std::milli>(cpu_end - cpu_start).count();
    // Device memory: allocate and copy all four matrices to the GPU.
    DeviceMem d_a(M * K * sizeof(InputType));
    DeviceMem d_b(K * N * sizeof(InputType));
    DeviceMem d_c(M * N * sizeof(AccumType));
    DeviceMem d_d(M * N * sizeof(AccumType));
    d_a.ToDevice(h_a.data(), M * K * sizeof(InputType));
    d_b.ToDevice(h_b.data(), K * N * sizeof(InputType));
    d_c.ToDevice(h_c.data(), M * N * sizeof(AccumType));
    d_d.ToDevice(h_d.data(), M * N * sizeof(AccumType));
    // Launch configuration: one block per 64×64 output tile.
    constexpr index_t block_size = 256;
    const index_t grid_size = (M / 64) * (N / 64); // 64×64 per block
    std::cout << "Launching kernel:\n";
    std::cout << " Grid: " << grid_size << " blocks\n";
    std::cout << " Block: " << block_size << " threads (4 warps in 2×2 config)\n";
    std::cout << " Each warp: 2×2 tile iterations = 4 tiles of 16×16\n";
    std::cout << " Each block: 64×64 output\n\n";
    stream_config stream;
    // Warmup launches so the timed run below excludes first-launch overhead.
    for(int i = 0; i < 5; ++i) {
        launch_kernel(stream,
            make_kernel<block_size>(
                TileSweepingYRepetitionHgemmKernel<InputType, InputType, AccumType, AccumType>{},
                dim3(grid_size),
                dim3(block_size),
                0,
                static_cast<const InputType*>(d_a.GetDeviceBuffer()),
                static_cast<const InputType*>(d_b.GetDeviceBuffer()),
                static_cast<const AccumType*>(d_c.GetDeviceBuffer()),
                static_cast<AccumType*>(d_d.GetDeviceBuffer()),
                M, N, K, lda, ldb, ldc, ldd, alpha, beta));
    }
    hip_check_error(hipDeviceSynchronize());
    // Timed run: single launch, wall-clock timed with a device sync on each side.
    auto gpu_start = std::chrono::high_resolution_clock::now();
    launch_kernel(stream,
        make_kernel<block_size>(
            TileSweepingYRepetitionHgemmKernel<InputType, InputType, AccumType, AccumType>{},
            dim3(grid_size),
            dim3(block_size),
            0,
            static_cast<const InputType*>(d_a.GetDeviceBuffer()),
            static_cast<const InputType*>(d_b.GetDeviceBuffer()),
            static_cast<const AccumType*>(d_c.GetDeviceBuffer()),
            static_cast<AccumType*>(d_d.GetDeviceBuffer()),
            M, N, K, lda, ldb, ldc, ldd, alpha, beta));
    hip_check_error(hipDeviceSynchronize());
    auto gpu_end = std::chrono::high_resolution_clock::now();
    double gpu_time_ms = std::chrono::duration<double, std::milli>(gpu_end - gpu_start).count();
    // Copy result back to host
    d_d.FromDevice(h_d.data(), M * N * sizeof(AccumType));
    // Verify element-wise against the CPU reference.
    // Buffers are column-major, so linear index i decodes as m = i % M, n = i / M.
    bool passed = true;
    float max_error = 0;
    index_t error_count = 0;
    for(index_t i = 0; i < M * N; ++i) {
        float error = std::abs(h_d[i] - h_d_ref[i]);
        max_error = std::max(max_error, error);
        if(error > 1e-2f) {
            if(error_count < 5) { // print only the first few mismatches
                index_t m = i % M;
                index_t n = i / M;
                std::cout << "Error at [" << m << "," << n << "]: "
                          << h_d[i] << " vs " << h_d_ref[i]
                          << " (diff=" << error << ")\n";
            }
            error_count++;
        }
    }
    passed = (error_count == 0);
    // GEMM performs 2*M*N*K flops (one multiply + one add per inner-product term).
    double gflops = 2.0 * M * N * K / 1e9;
    // BUGFIX: gflops / seconds yields GFLOP/s; divide by 1000 again so the
    // value printed under the "TFLOPS" label is actually in TFLOPS
    // (previously it was off by 1000×; the CPU GFLOPS line was already correct).
    double gpu_tflops = gflops / (gpu_time_ms / 1000) / 1000;
    double cpu_gflops = gflops / (cpu_time_ms / 1000);
    std::cout << "Results:\n";
    std::cout << " Correctness: " << (passed ? "✓ PASSED" : "✗ FAILED") << "\n";
    std::cout << " Max error: " << max_error << "\n";
    if(!passed) std::cout << " Error count: " << error_count << "/" << M*N << "\n";
    std::cout << "\n";
    std::cout << "Performance:\n";
    std::cout << " CPU time: " << cpu_time_ms << " ms (" << cpu_gflops << " GFLOPS)\n";
    std::cout << " GPU time: " << gpu_time_ms << " ms (" << gpu_tflops << " TFLOPS)\n";
    std::cout << " Speedup: " << cpu_time_ms / gpu_time_ms << "x\n\n";
    std::cout << "=== Key Insights ===\n";
    std::cout << "• Y-dimension repetition enables tile sweeping within distributions\n";
    std::cout << "• MIterPerWarp and NIterPerWarp control how many tiles each warp processes\n";
    std::cout << "• get_y_sliced_thread_data extracts specific tiles from block tensor\n";
    std::cout << "• static_for loops iterate over tile indices at compile time\n";
    std::cout << "• Replication still works: A replicates across NWarp, B across MWarp\n";
    std::cout << "• This pattern scales to production kernels (see 02_gemm)\n";
    std::cout << "• Each warp: 2×2 iters × 16×16 per tile = 32×32 output\n";
    std::cout << "• Each block: 2×2 warps × 32×32 per warp = 64×64 output\n\n";
    return passed ? 0 : 1;
}