Merge branch 'develop' into ckTileEnginePooling
3
.github/scripts/therock_configure_ci.py
vendored
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
import fnmatch
|
||||
import json
|
||||
import os
|
||||
|
||||
4
.github/workflows/therock-ci-linux.yml
vendored
@@ -20,7 +20,7 @@ jobs:
|
||||
permissions:
|
||||
id-token: write
|
||||
container:
|
||||
image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:2f3ebd0beb04c449fdb36933e54bdc69483b914fb9005594d3fc9444c206b54b
|
||||
image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6
|
||||
options: -v /runner/config:/home/awsconfig/
|
||||
env:
|
||||
AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
|
||||
@@ -54,7 +54,7 @@ jobs:
|
||||
with:
|
||||
repository: "ROCm/TheRock"
|
||||
path: "TheRock"
|
||||
ref: f3f77a3161922df3eee006b888b439d75b2b4668 # 2025-10-29 commit
|
||||
ref: d76278526218def9fb1b016bc9e421738cb4f8f6 # 2025-12-09 commit
|
||||
|
||||
- name: Setup ccache
|
||||
run: |
|
||||
|
||||
2
.github/workflows/therock-ci.yml
vendored
@@ -65,7 +65,7 @@ jobs:
|
||||
-DTHEROCK_USE_EXTERNAL_ROCM_LIBRARIES=ON
|
||||
-DTHEROCK_ROCM_LIBRARIES_SOURCE_DIR=../
|
||||
amdgpu_families: "gfx94X-dcgpu"
|
||||
test_runs_on: "linux-mi325-1gpu-ossci-rocm"
|
||||
test_runs_on: "linux-mi325-1gpu-ossci-rocm-frac"
|
||||
|
||||
therock_ci_summary:
|
||||
name: TheRock CI Summary
|
||||
|
||||
2
.github/workflows/therock-test-component.yml
vendored
@@ -51,7 +51,7 @@ jobs:
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
with:
|
||||
repository: "ROCm/TheRock"
|
||||
ref: f3f77a3161922df3eee006b888b439d75b2b4668 # 2025-10-29 commit
|
||||
ref: d76278526218def9fb1b016bc9e421738cb4f8f6 # 2025-12-09 commit
|
||||
|
||||
- name: Run setup test environment workflow
|
||||
uses: './.github/actions/setup_test_environment'
|
||||
|
||||
2
.github/workflows/therock-test-packages.yml
vendored
@@ -27,7 +27,7 @@ jobs:
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
with:
|
||||
repository: "ROCm/TheRock"
|
||||
ref: f3f77a3161922df3eee006b888b439d75b2b4668 # 2025-10-29 commit
|
||||
ref: d76278526218def9fb1b016bc9e421738cb4f8f6 # 2025-12-09 commit
|
||||
|
||||
- name: "Configuring CI options"
|
||||
env:
|
||||
|
||||
3
.gitignore
vendored
@@ -36,6 +36,9 @@ tags
|
||||
# Editors
|
||||
.vscode
|
||||
|
||||
# CMake formatting configuration (local)
|
||||
.cmake-format.yaml
|
||||
|
||||
# Cline
|
||||
.cline*
|
||||
|
||||
|
||||
@@ -20,12 +20,12 @@ repos:
|
||||
)$
|
||||
- repo: local
|
||||
hooks:
|
||||
# - id: copyright-year-checker
|
||||
# name: copyright-year-checker
|
||||
# entry: script/check_copyright_year.sh
|
||||
# verbose: false
|
||||
# language: script
|
||||
# types: [c++]
|
||||
- id: copyright-header-checker
|
||||
name: Check copyright headers
|
||||
entry: script/check_copyright_year.sh
|
||||
verbose: false
|
||||
language: script
|
||||
types_or: [c++, python, shell, cmake]
|
||||
- id: remove-exec-bit
|
||||
name: Remove executable bit from non-executable files
|
||||
entry: script/remove_exec_bit.sh
|
||||
|
||||
11
CHANGELOG.md
@@ -2,10 +2,21 @@
|
||||
|
||||
Documentation for Composable Kernel available at [https://rocm.docs.amd.com/projects/composable_kernel/en/latest/](https://rocm.docs.amd.com/projects/composable_kernel/en/latest/).
|
||||
|
||||
## (Unreleased) Composable Kernel 1.3.0
|
||||
|
||||
### Added
|
||||
* Added support for explicit GEMM in CK_TILE grouped convolution forward and backward weight.
|
||||
* Added TF32 convolution support on gfx942 and gfx950 in CK. It could be enabled/disabled via `DTYPES` of "tf32".
|
||||
|
||||
### Changed
|
||||
|
||||
### Upcoming changes
|
||||
|
||||
## Composable Kernel 1.2.0 for ROCm 7.2.0
|
||||
|
||||
### Added
|
||||
* Added support for bf16 data type to grouped_gemm and grouped_gemm_preshuffle.
|
||||
* Added Col-Col-Row-Col layout support for aquant mode in blockscale GEMM.
|
||||
* Added support for mixed precision fp8 x bf8 universal GEMM and weight preshuffle GEMM
|
||||
* Added a compute async pipeline in the CK TILE universal GEMM on gfx950
|
||||
* Added support for B Tensor type pk_int4_t in the CK TILE weight preshuffle GEMM.
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
cmake_minimum_required(VERSION 3.14)
|
||||
if(POLICY CMP0140)
|
||||
# policies CMP0140 not known to CMake until 3.25
|
||||
@@ -39,10 +42,12 @@ option(ENABLE_CLANG_CPP_CHECKS "Enables clang tidy, cppcheck" ON)
|
||||
option(MIOPEN_REQ_LIBS_ONLY "Build only the MIOpen required libraries" OFF)
|
||||
option(CK_EXPERIMENTAL_BUILDER "Enable experimental builder" OFF)
|
||||
option(BUILD_MHA_LIB "Build the static library for flash attention" OFF)
|
||||
option(FORCE_DISABLE_XDL "Skip compiling XDL specific instances (even if supported GPUs are included in GPU_TARGETS)" OFF)
|
||||
option(FORCE_DISABLE_WMMA "Skip compiling WMMA specific instances (even if supported GPUs are included in GPU_TARGETS)" OFF)
|
||||
|
||||
if(CK_EXPERIMENTAL_BUILDER)
|
||||
add_definitions(-DCK_EXPERIMENTAL_BUILDER)
|
||||
include_directories(${PROJECT_SOURCE_DIR}/experimental/builder/include)
|
||||
include_directories(${PROJECT_SOURCE_DIR}/experimental/builder/include)
|
||||
endif()
|
||||
|
||||
# Usage: for customized Python location cmake -DCK_USE_ALTERNATIVE_PYTHON="/opt/Python-3.8.13/bin/python3.8"
|
||||
@@ -87,6 +92,10 @@ if (DTYPES)
|
||||
add_definitions(-DCK_ENABLE_FP32)
|
||||
set(CK_ENABLE_FP32 "ON")
|
||||
endif()
|
||||
if (DTYPES MATCHES "tf32")
|
||||
# definition will be added based on the GPU target in the following section
|
||||
set(CK_ENABLE_TF32 "ON")
|
||||
endif()
|
||||
if (DTYPES MATCHES "fp64")
|
||||
add_definitions(-DCK_ENABLE_FP64)
|
||||
set(CK_ENABLE_FP64 "ON")
|
||||
@@ -101,6 +110,7 @@ else()
|
||||
set(CK_ENABLE_INT8 "ON")
|
||||
set(CK_ENABLE_FP16 "ON")
|
||||
set(CK_ENABLE_FP32 "ON")
|
||||
set(CK_ENABLE_TF32 "ON")
|
||||
set(CK_ENABLE_FP64 "ON")
|
||||
set(CK_ENABLE_BF16 "ON")
|
||||
set(CK_ENABLE_FP8 "ON")
|
||||
@@ -229,12 +239,12 @@ message(STATUS "Building CK for the following targets: ${SUPPORTED_GPU_TARGETS}"
|
||||
# Cache SUPPORTED_GPU_TARGETS for debug
|
||||
set(SUPPORTED_GPU_TARGETS "${SUPPORTED_GPU_TARGETS}" CACHE STRING "List of supported GPU targets")
|
||||
|
||||
if (SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx11|gfx12")
|
||||
if (SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx11|gfx12" AND NOT FORCE_DISABLE_XDL)
|
||||
message(STATUS "Enabling XDL instances")
|
||||
add_definitions(-DCK_USE_XDL)
|
||||
set(CK_USE_XDL "ON")
|
||||
endif()
|
||||
if (SUPPORTED_GPU_TARGETS MATCHES "gfx94" OR SUPPORTED_GPU_TARGETS MATCHES "gfx95")
|
||||
if ((SUPPORTED_GPU_TARGETS MATCHES "gfx94" OR SUPPORTED_GPU_TARGETS MATCHES "gfx95") AND NOT FORCE_DISABLE_XDL)
|
||||
message(STATUS "Enabling XDL FP8 gemms on native architectures")
|
||||
add_definitions(-DCK_USE_GFX94)
|
||||
set(CK_USE_GFX94 "ON")
|
||||
@@ -247,7 +257,7 @@ if (SUPPORTED_GPU_TARGETS MATCHES "gfx10")
|
||||
add_definitions(-DCK_GFX1030_SUPPORT)
|
||||
endif()
|
||||
|
||||
if (SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12")
|
||||
if ((SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12") AND NOT FORCE_DISABLE_WMMA)
|
||||
message(STATUS "Enabling WMMA instances")
|
||||
add_definitions(-DCK_USE_WMMA)
|
||||
set(CK_USE_WMMA "ON")
|
||||
@@ -257,7 +267,7 @@ endif()
|
||||
# define the macro with the current value (0 or 1)
|
||||
add_definitions(-DCK_TILE_USE_WMMA=${CK_TILE_USE_WMMA})
|
||||
|
||||
if (SUPPORTED_GPU_TARGETS MATCHES "gfx12")
|
||||
if (SUPPORTED_GPU_TARGETS MATCHES "gfx12" AND NOT FORCE_DISABLE_WMMA)
|
||||
message(STATUS "Enabling WMMA FP8 gemms on native architectures")
|
||||
add_definitions(-DCK_USE_WMMA_FP8)
|
||||
set(CK_USE_WMMA_FP8 "ON")
|
||||
@@ -277,6 +287,15 @@ if (SUPPORTED_GPU_TARGETS MATCHES "gfx950")
|
||||
set(CK_GFX950_SUPPORT "ON")
|
||||
endif()
|
||||
|
||||
if ((SUPPORTED_GPU_TARGETS MATCHES "gfx942" OR SUPPORTED_GPU_TARGETS MATCHES "gfx95") AND CK_ENABLE_TF32)
|
||||
add_definitions(-DCK_ENABLE_TF32)
|
||||
set(CK_ENABLE_TF32 "ON")
|
||||
else()
|
||||
message(STATUS "Disabling TF32 instances")
|
||||
remove_definitions(-DCK_ENABLE_TF32)
|
||||
set(CK_ENABLE_TF32 "OFF")
|
||||
endif()
|
||||
|
||||
option(CK_USE_FP8_ON_UNSUPPORTED_ARCH "Enable FP8 GEMM instances on older architectures" OFF)
|
||||
if(CK_USE_FP8_ON_UNSUPPORTED_ARCH AND (SUPPORTED_GPU_TARGETS MATCHES "gfx90a" OR SUPPORTED_GPU_TARGETS MATCHES "gfx908"))
|
||||
add_definitions(-DCK_USE_FP8_ON_UNSUPPORTED_ARCH)
|
||||
@@ -646,6 +665,9 @@ IF(IS_DIRECTORY "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu
|
||||
if(("${cmake_instance}" MATCHES "fp32" OR "${cmake_instance}" MATCHES "_f32") AND DTYPES MATCHES "fp32")
|
||||
set(add_inst 1)
|
||||
endif()
|
||||
if(("${cmake_instance}" MATCHES "tf32" OR "${cmake_instance}" MATCHES "_tf32") AND DTYPES MATCHES "tf32")
|
||||
set(add_inst 1)
|
||||
endif()
|
||||
if(("${cmake_instance}" MATCHES "fp64" OR "${cmake_instance}" MATCHES "_f64") AND DTYPES MATCHES "fp64")
|
||||
set(add_inst 1)
|
||||
endif()
|
||||
@@ -739,6 +761,13 @@ rocm_install(FILES
|
||||
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck/
|
||||
)
|
||||
|
||||
if(CK_EXPERIMENTAL_BUILDER)
|
||||
rocm_install(DIRECTORY
|
||||
${PROJECT_SOURCE_DIR}/experimental/builder/include/ck_tile/builder
|
||||
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck_tile
|
||||
)
|
||||
endif()
|
||||
|
||||
set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")
|
||||
set(CPACK_RPM_PACKAGE_LICENSE "MIT")
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
|
||||
FROM ubuntu:24.04
|
||||
ARG DEBIAN_FRONTEND=noninteractive
|
||||
ARG ROCMVERSION=7.0.1
|
||||
ARG ROCMVERSION=7.1.1
|
||||
ARG compiler_version=""
|
||||
ARG compiler_commit=""
|
||||
ARG CK_SCCACHE=""
|
||||
@@ -13,8 +13,8 @@ ENV DEBIAN_FRONTEND=noninteractive
|
||||
RUN set -xe && \
|
||||
apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl
|
||||
|
||||
RUN wget https://repo.radeon.com/amdgpu-install/7.0.1/ubuntu/noble/amdgpu-install_7.0.1.70001-1_all.deb && \
|
||||
apt install ./amdgpu-install_7.0.1.70001-1_all.deb -y && \
|
||||
RUN wget https://repo.radeon.com/amdgpu-install/7.1.1/ubuntu/noble/amdgpu-install_7.1.1.70101-1_all.deb && \
|
||||
apt install ./amdgpu-install_7.1.1.70101-1_all.deb -y && \
|
||||
apt update && \
|
||||
apt install python3-setuptools python3-wheel -y && \
|
||||
apt install rocm-dev -y
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
ARG BASE_DOCKER="rocm/composable_kernel:ck_ub24.04_rocm7.0.1"
|
||||
ARG BASE_DOCKER="rocm/composable_kernel:ck_ub24.04_rocm7.1.1"
|
||||
FROM $BASE_DOCKER
|
||||
ARG compiler_version=""
|
||||
ARG compiler_commit=""
|
||||
|
||||
@@ -20,4 +20,13 @@ RUN groupadd -g 109 render && \
|
||||
git clone -b "$CK_PYTORCH_BRANCH" https://github.com/ROCm/composable_kernel.git && \
|
||||
chown -R jenkins:jenkins /tmp/pytorch && \
|
||||
chmod -R a+rwx /tmp/pytorch && \
|
||||
sudo usermod -aG irc jenkins
|
||||
sudo usermod -aG irc jenkins && \
|
||||
#install hipblaslt
|
||||
git clone --no-checkout --filter=blob:none https://github.com/ROCm/rocm-libraries.git && \
|
||||
cd rocm-libraries && \
|
||||
git checkout develop && \
|
||||
git sparse-checkout init --cone && \
|
||||
git sparse-checkout set projects/hipblaslt shared/origami && \
|
||||
cd projects/hipblaslt && \
|
||||
git show --oneline -s && \
|
||||
CPLUS_INCLUDE_PATH="/opt/amdgpu/include/" ./install.sh -idc --architecture="gfx942;gfx950" -j 128 --skip_rocroller
|
||||
|
||||
36
Jenkinsfile
vendored
@@ -288,7 +288,7 @@ def getBaseDockerImageName(){
|
||||
}
|
||||
else{
|
||||
def ROCM_numeric = parseVersion("${params.ROCMVERSION}")
|
||||
if ( ROCM_numeric.major <= 7 && ROCM_numeric.minor < 1 ){
|
||||
if ( ROCM_numeric.major <= 7 && ROCM_numeric.minor < 2 ){
|
||||
img = "${env.CK_DOCKERHUB}:ck_ub24.04_rocm${params.ROCMVERSION}"
|
||||
}
|
||||
else{
|
||||
@@ -434,7 +434,7 @@ def buildDocker(install_prefix){
|
||||
}
|
||||
catch(Exception ex){
|
||||
echo "Unable to locate image: ${image_name}. Building image now"
|
||||
retimage = docker.build("${image_name}", dockerArgs + ' .')
|
||||
retimage = docker.build("${image_name}", dockerArgs)
|
||||
withDockerRegistry([ credentialsId: "ck_docker_cred", url: "" ]) {
|
||||
retimage.push()
|
||||
}
|
||||
@@ -447,7 +447,7 @@ def get_docker_options(){
|
||||
dockerOpts = "--network=host --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
|
||||
}
|
||||
else{ //only add kfd and dri paths if you actually going to run somthing on GPUs
|
||||
dockerOpts = "--network=host --device=/dev/kfd --device=/dev/dri --group-add video --group-add render --group-add irc --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
|
||||
dockerOpts = "--network=host --device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
|
||||
}
|
||||
if (params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline" || params.COMPILER_COMMIT != ""){
|
||||
// the --env COMPRESSED_BUNDLE_FORMAT_VERSION=2 env variable is required when building code with offload-compress flag with
|
||||
@@ -834,12 +834,14 @@ def Build_CK(Map conf=[:]){
|
||||
if (params.hipTensor_test && arch == "gfx90a" ){
|
||||
// build and test hipTensor on gfx90a node
|
||||
sh """#!/bin/bash
|
||||
rm -rf "${params.hipTensor_branch}".zip
|
||||
rm -rf hipTensor-"${params.hipTensor_branch}"
|
||||
wget https://github.com/ROCm/hipTensor/archive/refs/heads/"${params.hipTensor_branch}".zip
|
||||
unzip -o "${params.hipTensor_branch}".zip
|
||||
rm -rf rocm-libraries
|
||||
git clone --no-checkout --filter=blob:none https://github.com/ROCm/rocm-libraries.git
|
||||
cd rocm-libraries
|
||||
git sparse-checkout init --cone
|
||||
git sparse-checkout set projects/hiptensor
|
||||
git checkout "${params.hipTensor_branch}"
|
||||
"""
|
||||
dir("hipTensor-${params.hipTensor_branch}"){
|
||||
dir("rocm-libraries/projects/hiptensor"){
|
||||
sh """#!/bin/bash
|
||||
mkdir -p build
|
||||
ls -ltr
|
||||
@@ -1003,7 +1005,7 @@ def run_aiter_tests(Map conf=[:]){
|
||||
checkout scm
|
||||
//use the latest pytorch image
|
||||
def image = "${env.CK_DOCKERHUB_PRIVATE}:ck_aiter"
|
||||
def dockerOpts=get_docker_options()
|
||||
def dockerOpts=get_docker_options() + ' --group-add irc '
|
||||
|
||||
gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${env.STAGE_NAME}", account: 'ROCm', repo: 'composable_kernel') {
|
||||
try
|
||||
@@ -1055,7 +1057,7 @@ def run_pytorch_tests(Map conf=[:]){
|
||||
checkout scm
|
||||
//use the latest pytorch-nightly image
|
||||
def image = "${env.CK_DOCKERHUB}:ck_pytorch"
|
||||
def dockerOpts=get_docker_options()
|
||||
def dockerOpts=get_docker_options() + ' --group-add irc '
|
||||
|
||||
gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${env.STAGE_NAME}", account: 'ROCm', repo: 'composable_kernel') {
|
||||
try
|
||||
@@ -1095,7 +1097,7 @@ def run_pytorch_tests(Map conf=[:]){
|
||||
//launch develop branch daily jobs
|
||||
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;RUN_CK_TILE_FMHA_TESTS=true;RUN_PERFORMANCE_TESTS=true;FORCE_CI=true
|
||||
0 22 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;RUN_TILE_ENGINE_GEMM_TESTS=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true;FORCE_CI=true
|
||||
0 21 * * * % RUN_GROUPED_CONV_LARGE_CASES_TESTS=true;hipTensor_test=true;BUILD_GFX101=true;BUILD_GFX908=true;BUILD_GFX942=true;BUILD_GFX950=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true;FORCE_CI=true;BUILD_PACKAGES=true
|
||||
0 21 * * * % RUN_GROUPED_CONV_LARGE_CASES_TESTS=true;hipTensor_test=true;BUILD_GFX101=false;BUILD_GFX908=false;BUILD_GFX942=true;BUILD_GFX950=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true;FORCE_CI=true;BUILD_PACKAGES=true
|
||||
0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true;FORCE_CI=true
|
||||
0 17 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true;FORCE_CI=true
|
||||
0 15 * * * % BUILD_INSTANCES_ONLY=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;FORCE_CI=true
|
||||
@@ -1121,8 +1123,8 @@ pipeline {
|
||||
description: 'If you want to use a custom docker image, please specify it here (default: leave blank).')
|
||||
string(
|
||||
name: 'ROCMVERSION',
|
||||
defaultValue: '7.0.1',
|
||||
description: 'Specify which ROCM version to use: 7.0.1 (default).')
|
||||
defaultValue: '7.1.1',
|
||||
description: 'Specify which ROCM version to use: 7.1.1 (default).')
|
||||
string(
|
||||
name: 'COMPILER_VERSION',
|
||||
defaultValue: '',
|
||||
@@ -1615,11 +1617,13 @@ pipeline {
|
||||
-D GPU_TARGETS="gfx90a" \
|
||||
-D GEMM_DATATYPE="fp8;fp16" \
|
||||
-D GEMM_LAYOUT="rcr;rrr;crr;ccr" \
|
||||
-D GEMM_STREAMK_DATATYPE="fp8;fp16" \
|
||||
-D GEMM_STREAMK_LAYOUT="rcr" \
|
||||
-D GEMM_MULTI_D_DATATYPE="fp16" \
|
||||
-D GEMM_MULTI_D_LAYOUT="rcrr;rrrr;crrr;ccrr" \
|
||||
-D GEMM_PRESHUFFLE_DATATYPE="fp16;fp8;bf16;bf8" \
|
||||
-D GEMM_PRESHUFFLE_LAYOUT="rcr" .. && \
|
||||
ninja -j64 benchmark_gemm_all benchmark_gemm_preshuffle_all benchmark_gemm_multi_d_all && \
|
||||
ninja -j64 benchmark_gemm_all benchmark_gemm_preshuffle_all benchmark_gemm_multi_d_all benchmark_gemm_streamk_all && \
|
||||
python3 ../tile_engine/ops/gemm/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
|
||||
python3 ../tile_engine/ops/gemm_preshuffle/gemm_preshuffle_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
|
||||
python3 ../tile_engine/ops/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """
|
||||
@@ -1644,11 +1648,13 @@ pipeline {
|
||||
-D GPU_TARGETS="gfx942" \
|
||||
-D GEMM_DATATYPE="fp8;fp16" \
|
||||
-D GEMM_LAYOUT="rcr;rrr;crr;ccr" \
|
||||
-D GEMM_STREAMK_DATATYPE="fp8;fp16" \
|
||||
-D GEMM_STREAMK_LAYOUT="rcr" \
|
||||
-D GEMM_MULTI_D_DATATYPE="fp16" \
|
||||
-D GEMM_MULTI_D_LAYOUT="rcrr;rrrr;crrr;ccrr" \
|
||||
-D GEMM_PRESHUFFLE_DATATYPE="fp16;fp8;bf16;bf8" \
|
||||
-D GEMM_PRESHUFFLE_LAYOUT="rcr" .. && \
|
||||
ninja -j64 benchmark_gemm_all benchmark_gemm_preshuffle_all benchmark_gemm_multi_d_all && \
|
||||
ninja -j64 benchmark_gemm_all benchmark_gemm_preshuffle_all benchmark_gemm_multi_d_all benchmark_gemm_streamk_all && \
|
||||
python3 ../tile_engine/ops/gemm/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
|
||||
python3 ../tile_engine/ops/gemm_preshuffle/gemm_preshuffle_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
|
||||
python3 ../tile_engine/ops/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """
|
||||
|
||||
@@ -187,7 +187,7 @@ limit the number of threads. For example, if you have a 128-core CPU and 128 Gb
|
||||
|
||||
Additional cmake flags can be used to significantly speed-up the build:
|
||||
|
||||
* `DTYPES` (default is not set) can be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build
|
||||
* `DTYPES` (default is not set) can be set to any subset of "fp64;fp32;tf32;fp16;fp8;bf16;int8" to build
|
||||
instances of select data types only. The main default data types are fp32 and fp16; you can safely skip
|
||||
other data types.
|
||||
|
||||
|
||||
@@ -1,2 +1,5 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
add_executable(client_gemm gemm.cpp)
|
||||
target_link_libraries(client_gemm PRIVATE composable_kernel::device_other_operations composable_kernel::device_gemm_operations)
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
if(GPU_TARGETS MATCHES "gfx9")
|
||||
add_custom_target(client_gemm_fastgelu_examples)
|
||||
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
if(GPU_TARGETS MATCHES "gfx9")
|
||||
add_executable(client_gemm_add_add_layernorm_naive gemm_add_add_layernorm_naive.cpp)
|
||||
target_link_libraries(client_gemm_add_add_layernorm_naive PRIVATE composable_kernel::device_gemm_operations composable_kernel::device_other_operations)
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
if(GPU_TARGETS MATCHES "gfx9")
|
||||
add_executable(client_contraction_scale_fp32 contraction_scale_fp32.cpp)
|
||||
target_link_libraries(client_contraction_scale_fp32 PRIVATE composable_kernel::device_other_operations composable_kernel::device_contraction_operations composable_kernel::device_gemm_operations)
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
add_executable(client_layernorm2d_bwd_data layernorm2d_bwd_data.cpp)
|
||||
target_link_libraries(client_layernorm2d_bwd_data PRIVATE composable_kernel::device_other_operations)
|
||||
|
||||
|
||||
@@ -1,2 +1,5 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
add_executable(client_softmax4d softmax4d.cpp)
|
||||
target_link_libraries(client_softmax4d PRIVATE composable_kernel::device_other_operations composable_kernel::device_reduction_operations)
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
if(GPU_TARGETS MATCHES "gfx9")
|
||||
add_executable(client_grouped_conv2d_fwd grouped_conv2d_fwd.cpp)
|
||||
target_link_libraries(client_grouped_conv2d_fwd PRIVATE composable_kernel::device_conv_operations)
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
if(GPU_TARGETS MATCHES "gfx9")
|
||||
add_executable(client_fused_attention fused_attention.cpp)
|
||||
target_link_libraries(client_fused_attention PRIVATE composable_kernel::device_other_operations composable_kernel::device_gemm_operations)
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
if(GPU_TARGETS MATCHES "gfx9" AND (DTYPES MATCHES "int8" OR NOT DEFINED DTYPES))
|
||||
add_executable(client_conv2d_fwd_bias_tanh_perchannel_quantization conv2d_fwd_bias_tanh_perchannel_quantization.cpp)
|
||||
target_link_libraries(client_conv2d_fwd_bias_tanh_perchannel_quantization PRIVATE composable_kernel::device_conv_operations composable_kernel::device_other_operations composable_kernel::device_gemm_operations)
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
add_executable(client_grouped_conv2d_bwd_data grouped_conv2d_bwd_data.cpp)
|
||||
target_link_libraries(client_grouped_conv2d_bwd_data PRIVATE composable_kernel::device_conv_operations)
|
||||
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
add_executable(client_grouped_conv1d_bwd_weight_fp16 grouped_conv1d_bwd_weight_fp16.cpp)
|
||||
add_executable(client_grouped_conv2d_bwd_weight_fp16 grouped_conv2d_bwd_weight_fp16.cpp)
|
||||
add_executable(client_grouped_conv3d_bwd_weight_fp16 grouped_conv3d_bwd_weight_fp16.cpp)
|
||||
|
||||
@@ -1,2 +1,5 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
add_executable(client_elementwise_layernorm2d elementwise_layernorm2d.cpp)
|
||||
target_link_libraries(client_elementwise_layernorm2d PRIVATE composable_kernel::device_other_operations)
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
add_executable(client_batchnorm_fwd_nhwc batchnorm_fwd_nhwc.cpp)
|
||||
add_executable(client_batchnorm_bwd_nhwc batchnorm_bwd_nhwc.cpp)
|
||||
add_executable(client_batchnorm_infer_nhwc batchnorm_infer_nhwc.cpp)
|
||||
|
||||
@@ -1,2 +1,5 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
add_executable(client_batchnorm_fwd_instance_id batchnorm_fwd_instance_id.cpp)
|
||||
target_link_libraries(client_batchnorm_fwd_instance_id PRIVATE composable_kernel::device_other_operations)
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
if(GPU_TARGETS MATCHES "gfx9")
|
||||
add_executable(client_conv3d_bwd_data_fp16 conv3d_bwd_data_fp16.cpp)
|
||||
add_executable(client_conv3d_bwd_data_fp32 conv3d_bwd_data_fp32.cpp)
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
if((DTYPES MATCHES "fp16") OR NOT DEFINED DTYPES)
|
||||
add_executable(client_conv3d_fwd_fp16 conv3d_fwd_fp16.cpp)
|
||||
target_link_libraries(client_conv3d_fwd_fp16 PRIVATE composable_kernel::device_conv_operations)
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
if(GPU_TARGETS MATCHES "gfx9")
|
||||
add_executable(client_grouped_gemm_fastgelu grouped_gemm_fastgelu.cpp)
|
||||
target_link_libraries(client_grouped_gemm_fastgelu PRIVATE composable_kernel::device_gemm_operations)
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
add_executable(client_groupnorm_bwd_data groupnorm_bwd_data.cpp)
|
||||
target_link_libraries(client_groupnorm_bwd_data PRIVATE composable_kernel::device_other_operations)
|
||||
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
add_executable(client_max_pool2d_fwd max_pool2d_fwd.cpp)
|
||||
target_link_libraries(client_max_pool2d_fwd PRIVATE composable_kernel::device_other_operations)
|
||||
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "fp16") OR (NOT DEFINED DTYPES AND GPU_TARGETS MATCHES "gfx94"))
|
||||
add_executable(client_splitK_gemm splitK_gemm_fp16_f8.cpp)
|
||||
target_link_libraries(client_splitK_gemm PRIVATE composable_kernel::device_gemm_operations)
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
if(GPU_TARGETS MATCHES "gfx9")
|
||||
add_executable(client_grouped_gemm_fixed_nk_bias_fp16 grouped_gemm_fixed_nk_bias_fp16.cpp)
|
||||
target_link_libraries(client_grouped_gemm_fixed_nk_bias_fp16 PRIVATE composable_kernel::device_gemm_operations)
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
if(GPU_TARGETS MATCHES "gfx9")
|
||||
add_executable(client_grouped_gemm_fixed_nk_fp16 grouped_gemm_fixed_nk_fp16.cpp)
|
||||
target_link_libraries(client_grouped_gemm_fixed_nk_fp16 PRIVATE composable_kernel::device_gemm_operations)
|
||||
|
||||
@@ -1,2 +1,5 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
add_executable(client_elementwise_transpose3d elementwise_transpose_3d.cpp)
|
||||
target_link_libraries(client_elementwise_transpose3d PRIVATE composable_kernel::device_other_operations)
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
if(GPU_TARGETS MATCHES "gfx9")
|
||||
# Fwd scaleadd scaleadd relu
|
||||
add_executable(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp32
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
add_executable(client_tensor_transform_using_wrapper tensor_transform_using_wrapper.cpp)
|
||||
target_link_libraries(client_tensor_transform_using_wrapper PRIVATE composable_kernel::device_other_operations)
|
||||
add_executable(client_wrapper_img2col wrapper_img2col.cpp)
|
||||
|
||||
@@ -1,2 +1,5 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
add_executable(client_reduce_nhwc_c reduce_nhwc_c.cpp)
|
||||
target_link_libraries(client_reduce_nhwc_c PRIVATE composable_kernel::device_reduction_operations)
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
add_executable(client_image_to_column image_to_column.cpp)
|
||||
target_link_libraries(client_image_to_column PRIVATE composable_kernel::device_other_operations)
|
||||
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
if(GPU_TARGETS MATCHES "gfx950")
|
||||
add_executable(client_gemm_mx_fp8 gemm_mx_fp8.cpp)
|
||||
target_link_libraries(client_gemm_mx_fp8 PRIVATE composable_kernel::device_gemm_operations)
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
if(GPU_TARGETS MATCHES "gfx9")
|
||||
add_executable(client_gemm_add_multiply gemm_add_multiply.cpp)
|
||||
target_link_libraries(client_gemm_add_multiply PRIVATE composable_kernel::device_gemm_operations)
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
if(GPU_TARGETS MATCHES "gfx9" AND ((DTYPES MATCHES "int8" AND DTYPES MATCHES "bf16") OR NOT DEFINED DTYPES))
|
||||
add_executable(client_gemm_bias_fastgelu_bf16_i8_bf16 gemm_bias_fastgelu_xdl_bf16_i8.cpp)
|
||||
target_link_libraries(client_gemm_bias_fastgelu_bf16_i8_bf16 PRIVATE composable_kernel::device_gemm_operations)
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
if(GPU_TARGETS MATCHES "gfx9" AND ((DTYPES MATCHES "int8" AND DTYPES MATCHES "bf16") OR NOT DEFINED DTYPES))
|
||||
add_executable(client_grouped_gemm_bias_fastgelu_bf16_i8_bf16 grouped_gemm_bias_fastgelu_xdl_bf16_i8.cpp)
|
||||
target_link_libraries(client_grouped_gemm_bias_fastgelu_bf16_i8_bf16 PRIVATE composable_kernel::device_gemm_operations)
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
cmake_minimum_required(VERSION 3.15)
|
||||
project(ck_app)
|
||||
add_compile_options(-std=c++20)
|
||||
@@ -24,6 +27,9 @@ if (DTYPES)
|
||||
add_definitions(-DCK_ENABLE_FP32)
|
||||
set(CK_ENABLE_FP32 "ON")
|
||||
endif()
|
||||
if (DTYPES MATCHES "tf32")
|
||||
set(CK_ENABLE_TF32 "ON")
|
||||
endif()
|
||||
if (DTYPES MATCHES "fp64")
|
||||
add_definitions(-DCK_ENABLE_FP64)
|
||||
set(CK_ENABLE_FP64 "ON")
|
||||
@@ -38,6 +44,7 @@ else()
|
||||
set(CK_ENABLE_INT8 "ON")
|
||||
set(CK_ENABLE_FP16 "ON")
|
||||
set(CK_ENABLE_FP32 "ON")
|
||||
set(CK_ENABLE_TF32 "ON")
|
||||
set(CK_ENABLE_FP64 "ON")
|
||||
set(CK_ENABLE_BF16 "ON")
|
||||
if (GPU_TARGETS MATCHES "gfx94")
|
||||
@@ -64,6 +71,14 @@ if (GPU_TARGETS)
|
||||
add_definitions(-DCK_USE_FNUZ_FP8)
|
||||
set(CK_USE_FNUZ_FP8 "ON")
|
||||
endif()
|
||||
if ((GPU_TARGETS MATCHES "gfx942" OR GPU_TARGETS MATCHES "gfx95") AND CK_ENABLE_TF32)
|
||||
add_definitions(-DCK_ENABLE_TF32)
|
||||
set(CK_ENABLE_TF32 "ON")
|
||||
else()
|
||||
message(STATUS "Disabling TF32 instances for this target")
|
||||
remove_definitions(-DCK_ENABLE_TF32)
|
||||
set(CK_ENABLE_TF32 "OFF")
|
||||
endif()
|
||||
else()
|
||||
add_definitions(-DCK_USE_WMMA -DCK_USE_XDL)
|
||||
set(CK_USE_XDL "ON")
|
||||
|
||||
@@ -35,7 +35,7 @@ function(generate_sharded_instantiations)
|
||||
set(GENERATED_SOURCE_FILES "")
|
||||
set(EXTERN_TEMPLATE_STATEMENTS "")
|
||||
set(CALL_STATEMENTS "")
|
||||
message(STATUS "Generating sharded instantiations for target: ${GEN_SHARDED_INSTANCES_NAME}")
|
||||
message(DEBUG "Generating sharded instantiations for target: ${GEN_SHARDED_INSTANCES_NAME}")
|
||||
|
||||
set(INSTANCES "${GEN_SHARDED_INSTANCES_NAME}")
|
||||
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
cmake_minimum_required(VERSION 3.16)
|
||||
project(composable_kernel_host)
|
||||
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
|
||||
add_subdirectory(rtc)
|
||||
file(GLOB TEST_SRCS CONFIGURE_DEPENDS *.cpp)
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
find_package(hip)
|
||||
file(GLOB RTC_SOURCES CONFIGURE_DEPENDS src/*.cpp)
|
||||
add_library(ck_rtc ${RTC_SOURCES})
|
||||
|
||||
33
docs/conceptual/ck_tile/CK-tile-index.rst
Normal file
@@ -0,0 +1,33 @@
|
||||
.. _ck_tile_index:
|
||||
|
||||
************************
|
||||
CK Tile Index
|
||||
************************
|
||||
|
||||
CK Tile documentation structure:
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
|
||||
introduction_motivation
|
||||
buffer_views
|
||||
tensor_views
|
||||
tile_distribution
|
||||
coordinate_systems
|
||||
terminology
|
||||
adaptors
|
||||
transforms
|
||||
descriptors
|
||||
tile_window
|
||||
load_store_traits
|
||||
space_filling_curve
|
||||
static_distributed_tensor
|
||||
convolution_example
|
||||
coordinate_movement
|
||||
lds_index_swapping
|
||||
swizzling_example
|
||||
tensor_coordinates
|
||||
sweep_tile
|
||||
encoding_internals
|
||||
thread_mapping
|
||||
hardware/index
|
||||
156
docs/conceptual/ck_tile/MERMAID_DIAGRAMS.md
Normal file
@@ -0,0 +1,156 @@
|
||||
# Mermaid Diagram Management
|
||||
|
||||
This document explains how to manage mermaid diagrams in the CK Tile documentation.
|
||||
|
||||
## Overview
|
||||
|
||||
All mermaid diagrams in the CK Tile documentation have been converted to SVG files for better rendering compatibility. The original mermaid source code is preserved as commented blocks in the RST files, allowing easy updates when needed.
|
||||
|
||||
## Directory Structure
|
||||
|
||||
- `docs/conceptual/ck_tile/diagrams/` - Contains all SVG diagram files
|
||||
- `docs/conceptual/ck_tile/convert_mermaid_to_svg.py` - Initial conversion script (one-time use)
|
||||
- `docs/conceptual/ck_tile/update_diagrams.py` - Helper script to regenerate diagrams from comments
|
||||
|
||||
## Diagram Format in RST Files
|
||||
|
||||
Each diagram follows this format:
|
||||
|
||||
```rst
|
||||
..
|
||||
Original mermaid diagram (edit here, then run update_diagrams.py)
|
||||
|
||||
.. mermaid::
|
||||
|
||||
graph TB
|
||||
A --> B
|
||||
B --> C
|
||||
|
||||
.. image:: diagrams/diagram_name.svg
|
||||
:alt: Diagram
|
||||
:align: center
|
||||
```
|
||||
|
||||
The commented mermaid block won't appear in the rendered documentation but serves as the source for regenerating the SVG.
|
||||
|
||||
## Updating Diagrams
|
||||
|
||||
### When to Update
|
||||
|
||||
You need to regenerate SVG files when:
|
||||
- Modifying the mermaid source in a commented block
|
||||
- Adding new diagrams
|
||||
- Updating diagram styling
|
||||
|
||||
### How to Update
|
||||
|
||||
1. **Edit the commented mermaid source** in the RST file
|
||||
2. **Run the update script**:
|
||||
```bash
|
||||
# Update all diagrams
|
||||
python docs/conceptual/ck_tile/update_diagrams.py
|
||||
|
||||
# Update diagrams in a specific file
|
||||
python docs/conceptual/ck_tile/update_diagrams.py transforms.rst
|
||||
|
||||
# Force regenerate all diagrams (even if SVGs exist)
|
||||
python docs/conceptual/ck_tile/update_diagrams.py --force
|
||||
```
|
||||
|
||||
### Prerequisites
|
||||
|
||||
The update script requires [mermaid-cli](https://github.com/mermaid-js/mermaid-cli):
|
||||
|
||||
```bash
|
||||
npm install -g @mermaid-js/mermaid-cli
|
||||
```
|
||||
|
||||
## Adding New Diagrams
|
||||
|
||||
To add a new mermaid diagram:
|
||||
|
||||
1. **Create the commented block** in your RST file:
|
||||
```rst
|
||||
..
|
||||
Original mermaid diagram (edit here, then run update_diagrams.py)
|
||||
|
||||
.. mermaid::
|
||||
|
||||
graph TB
|
||||
A --> B
|
||||
```
|
||||
|
||||
2. **Add the image reference** immediately after:
|
||||
```rst
|
||||
.. image:: diagrams/my_new_diagram.svg
|
||||
:alt: My New Diagram
|
||||
:align: center
|
||||
```
|
||||
|
||||
3. **Generate the SVG**:
|
||||
```bash
|
||||
python docs/conceptual/ck_tile/update_diagrams.py your_file.rst
|
||||
```
|
||||
|
||||
## Current Diagrams
|
||||
|
||||
The following RST files contain mermaid diagrams (40 total):
|
||||
|
||||
- `adaptors.rst` (2 diagrams)
|
||||
- `convolution_example.rst` (1 diagram)
|
||||
- `coordinate_movement.rst` (1 diagram)
|
||||
- `descriptors.rst` (2 diagrams)
|
||||
- `encoding_internals.rst` (2 diagrams)
|
||||
- `lds_index_swapping.rst` (3 diagrams)
|
||||
- `load_store_traits.rst` (2 diagrams)
|
||||
- `space_filling_curve.rst` (1 diagram)
|
||||
- `static_distributed_tensor.rst` (1 diagram)
|
||||
- `sweep_tile.rst` (4 diagrams)
|
||||
- `tensor_coordinates.rst` (2 diagrams)
|
||||
- `thread_mapping.rst` (2 diagrams)
|
||||
- `tile_window.rst` (5 diagrams)
|
||||
- `transforms.rst` (12 diagrams)
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### SVG not generated
|
||||
|
||||
- Check that mermaid-cli is installed: `mmdc --version`
|
||||
- Verify the mermaid syntax is valid
|
||||
- Look for error messages in the script output
|
||||
|
||||
### Diagram not updating
|
||||
|
||||
- Use `--force` flag to regenerate: `python docs/update_diagrams.py --force`
|
||||
- Check that the image reference matches the generated filename
|
||||
|
||||
### Pattern not matching
|
||||
|
||||
If the update script can't find your commented diagram:
|
||||
- Ensure proper indentation (3 spaces for comment block content)
|
||||
- Verify the `.. mermaid::` directive is commented
|
||||
- Check that the image reference immediately follows the comment block
|
||||
|
||||
## Script Details
|
||||
|
||||
### update_diagrams.py
|
||||
|
||||
This script:
|
||||
1. Scans RST files for commented mermaid blocks
|
||||
2. Extracts the mermaid source code
|
||||
3. Converts to SVG using `mmdc`
|
||||
4. Saves to the diagrams directory
|
||||
|
||||
**Usage:**
|
||||
- `python docs/conceptual/ck_tile/update_diagrams.py` - Check all files, update missing SVGs
|
||||
- `python docs/conceptual/ck_tile/update_diagrams.py --force` - Regenerate all SVGs
|
||||
- `python docs/conceptual/ck_tile/update_diagrams.py <file.rst>` - Update specific file
|
||||
|
||||
### convert_mermaid_to_svg.py
|
||||
|
||||
This was the initial conversion script. It:
|
||||
1. Found all active `.. mermaid::` directives
|
||||
2. Converted them to SVGs
|
||||
3. Replaced directives with commented source + image references
|
||||
|
||||
This script was used once for the initial conversion and typically doesn't need to be run again.
|
||||
391
docs/conceptual/ck_tile/adaptors.rst
Normal file
@@ -0,0 +1,391 @@
|
||||
.. _ck_tile_adaptors:
|
||||
|
||||
Tensor Adaptors - Chaining Transformations
|
||||
==========================================
|
||||
|
||||
Overview
|
||||
--------
|
||||
|
||||
While individual :ref:`transforms <ck_tile_transforms>` are effective, TensorAdaptors enable the chaining of multiple transforms together to create complex coordinate transformations. Adaptors can be thought of as transformation pipelines that can reshape, reorder, and restructure tensors in advanced ways.
|
||||
|
||||
TensorAdaptors serve as the bridge between individual transforms and the high-level tensor operations used in applications. They provide a composable abstraction that allows developers to build complex data access patterns from simple building blocks.
|
||||
|
||||
TensorAdaptor Basics
|
||||
--------------------
|
||||
|
||||
A TensorAdaptor encapsulates a sequence of :ref:`coordinate transformations <ck_tile_coordinate_systems>`, managing the flow of coordinates through multiple transform stages:
|
||||
|
||||
..
|
||||
Original mermaid diagram (edit here, then run update_diagrams.py)
|
||||
|
||||
..
|
||||
Original mermaid diagram (edit here, then run update_diagrams.py)
|
||||
|
||||
.. mermaid::
|
||||
|
||||
graph LR
|
||||
subgraph "Adaptor Composition"
|
||||
subgraph "Single Transform"
|
||||
direction TB
|
||||
I1["Input Coords<br/>[0,1,2]"]
|
||||
T1["Transform<br/>(e.g., Transpose)"]
|
||||
O1["Output Coords<br/>[2,0,1]"]
|
||||
I1 --> T1 --> O1
|
||||
end
|
||||
|
||||
subgraph "Chained Transforms"
|
||||
direction TB
|
||||
I2["Input<br/>2D"]
|
||||
T2A["Transform A<br/>(e.g., Merge)"]
|
||||
M2["Intermediate<br/>1D"]
|
||||
T2B["Transform B<br/>(e.g., Pad)"]
|
||||
O2["Output<br/>1D Padded"]
|
||||
I2 --> T2A --> M2 --> T2B --> O2
|
||||
end
|
||||
end
|
||||
|
||||
style T1 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
|
||||
style T2A fill:#fff3e0,stroke:#f57c00,stroke-width:2px
|
||||
style T2B fill:#fff3e0,stroke:#f57c00,stroke-width:2px
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
.. image:: diagrams/adaptors_1.svg
|
||||
:alt: Diagram
|
||||
:align: center
|
||||
|
||||
.. image:: diagrams/adaptors_1.svg
|
||||
:alt: Diagram
|
||||
:align: center
|
||||
Core Components
|
||||
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
Each TensorAdaptor contains:
|
||||
|
||||
- **transforms**: List of individual :ref:`transforms <ck_tile_transforms>` to apply
|
||||
- **lower_dimension_hidden_idss**: Mappings between transform stages
|
||||
- **upper_dimension_hidden_idss**: Hidden dimension mappings for internal stages
|
||||
- **bottom_dimension_hidden_ids**: Input dimension identifiers
|
||||
- **top_dimension_hidden_ids**: Output dimension identifiers
|
||||
|
||||
The most important method of a TensorAdaptor is ``calculate_bottom_index``, which calculates the lower index from the upper index by applying transforms in reverse order.
|
||||
|
||||
Transpose Adaptor: Dimension Reordering
|
||||
---------------------------------------
|
||||
|
||||
The transpose adaptor reorders tensor dimensions according to a permutation pattern. This operation forms the basis for many tensor manipulations in GPU kernels.
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
// Create transpose adaptor: [0, 1, 2] → [2, 0, 1]
|
||||
auto transpose_adaptor = make_identity_tensor_adaptor<3>(); // Start with identity
|
||||
|
||||
// Apply transpose using transform_tensor_adaptor
|
||||
auto transposed_desc = transform_tensor_descriptor(
|
||||
original_desc,
|
||||
make_tuple(make_pass_through_transform(original_desc.get_length(2)),
|
||||
make_pass_through_transform(original_desc.get_length(0)),
|
||||
make_pass_through_transform(original_desc.get_length(1))),
|
||||
make_tuple(sequence<2>{}, sequence<0>{}, sequence<1>{}), // old dims
|
||||
make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}) // new dims
|
||||
);
|
||||
|
||||
// Alternative: Direct coordinate transformation
|
||||
multi_index<3> top_coord{0, 1, 2};
|
||||
// After transpose [2, 0, 1]: coord becomes [2, 0, 1]
|
||||
|
||||
Single-Stage Adaptors: Custom Transform Chains
|
||||
----------------------------------------------
|
||||
|
||||
Custom adaptors can be created by specifying which transforms to use and how they connect. This provides fine-grained control over the transformation pipeline:
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
// Create a descriptor that merges 2x3 dimensions into single dimension
|
||||
auto base_desc = make_naive_tensor_descriptor_packed(make_tuple(2, 3));
|
||||
|
||||
// Apply merge transform
|
||||
auto merged_desc = transform_tensor_descriptor(
|
||||
base_desc,
|
||||
make_tuple(make_merge_transform(make_tuple(2, 3))),
|
||||
make_tuple(sequence<0, 1>{}), // merge dims 0,1
|
||||
make_tuple(sequence<0>{}) // to single dim 0
|
||||
);
|
||||
|
||||
// The adaptor is embedded in the :ref:`descriptor <ck_tile_descriptors>`
|
||||
// To use it:
|
||||
multi_index<1> top_coord{5}; // 1D coordinate
|
||||
// This internally calculates: row = 5/3 = 1, col = 5%3 = 2
|
||||
|
||||
Chaining Adaptors: Building Complex Transformations
|
||||
---------------------------------------------------
|
||||
|
||||
The real power of adaptors comes from chaining multiple transformations together to create advanced data access patterns:
|
||||
|
||||
..
|
||||
Original mermaid diagram (edit here, then run update_diagrams.py)
|
||||
|
||||
..
|
||||
Original mermaid diagram (edit here, then run update_diagrams.py)
|
||||
|
||||
.. mermaid::
|
||||
|
||||
graph LR
|
||||
subgraph "Adaptor Chaining Flow"
|
||||
subgraph "Adaptor 1"
|
||||
A1I["Bottom Dims<br/>[0,1]"]
|
||||
A1T["Transform:<br/>Merge[2,3]"]
|
||||
A1O["Top Dims<br/>[0]"]
|
||||
end
|
||||
|
||||
subgraph "Adaptor 2"
|
||||
A2I["Bottom Dims<br/>[0]"]
|
||||
A2T["Transform:<br/>Unmerge[2,3]"]
|
||||
A2O["Top Dims<br/>[0,1]"]
|
||||
end
|
||||
|
||||
subgraph "Chained Result"
|
||||
CI["Input 2D<br/>Bottom[0,1]"]
|
||||
CO["Output 2D<br/>Top[0,1]"]
|
||||
end
|
||||
end
|
||||
|
||||
A1I --> A1T
|
||||
A1T --> A1O
|
||||
A1O --> A2I
|
||||
A2I --> A2T
|
||||
A2T --> A2O
|
||||
|
||||
CI --> A1I
|
||||
A2O --> CO
|
||||
|
||||
style A1T fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
|
||||
style A2T fill:#fff3e0,stroke:#f57c00,stroke-width:2px
|
||||
style CI fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
|
||||
style CO fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
.. image:: diagrams/adaptors_2.svg
|
||||
:alt: Diagram
|
||||
:align: center
|
||||
|
||||
.. image:: diagrams/adaptors_2.svg
|
||||
:alt: Diagram
|
||||
:align: center
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
// Start with a 2D descriptor
|
||||
auto desc1 = make_naive_tensor_descriptor_packed(make_tuple(2, 3));
|
||||
|
||||
// First transformation: merge 2D to 1D
|
||||
auto merged_desc = transform_tensor_descriptor(
|
||||
desc1,
|
||||
make_tuple(make_merge_transform(make_tuple(2, 3))),
|
||||
make_tuple(sequence<0, 1>{}), // merge dims 0,1
|
||||
make_tuple(sequence<0>{}) // to dim 0
|
||||
);
|
||||
|
||||
// Second transformation: unmerge 1D back to 2D
|
||||
auto final_desc = transform_tensor_descriptor(
|
||||
merged_desc,
|
||||
make_tuple(make_unmerge_transform(make_tuple(2, 3))),
|
||||
make_tuple(sequence<0>{}), // from dim 0
|
||||
make_tuple(sequence<0, 1>{}) // to dims 0,1
|
||||
);
|
||||
|
||||
// The chained transformation is embedded in final_desc
|
||||
// Result should be identity transformation
|
||||
|
||||
Transform Addition: Extending Existing Adaptors
|
||||
-----------------------------------------------
|
||||
|
||||
Existing adaptors can be extended with new transforms using ``transform_tensor_adaptor``. This pattern is useful for adding padding or other modifications to existing transformation pipelines:
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
// Start with transposed descriptor
|
||||
auto base_desc = make_naive_tensor_descriptor(
|
||||
make_tuple(3, 4),
|
||||
make_tuple(1, 3) // transposed strides
|
||||
);
|
||||
|
||||
// Add padding to both dimensions
|
||||
auto padded_desc = transform_tensor_descriptor(
|
||||
base_desc,
|
||||
make_tuple(make_pad_transform(3, 1, 1), // pad dim 0: 3 → 5
|
||||
make_pad_transform(4, 0, 0)), // keep dim 1: 4 → 4
|
||||
make_tuple(sequence<0>{}, sequence<1>{}), // input dims
|
||||
make_tuple(sequence<0>{}, sequence<1>{}) // output dims (keep 2D)
|
||||
);
|
||||
|
||||
// Access pattern
|
||||
multi_index<2> padded_coord{1, 2}; // In padded space
|
||||
// Internally calculates: unpadded = [1-1, 2] = [0, 2]
|
||||
// Then applies transpose strides
|
||||
|
||||
Advanced Patterns
|
||||
-----------------
|
||||
|
||||
Complex Nested Transforms
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
CK Tile supports complex nested transform patterns that enable advanced data layouts:
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
// Example: 4D tensor with complex transformations
|
||||
// Shape: [A, B, C, D] with various transforms
|
||||
|
||||
// 1. Create base descriptor
|
||||
auto base_desc = make_naive_tensor_descriptor_packed(
|
||||
make_tuple(A, B, C, D)
|
||||
);
|
||||
|
||||
// 2. Apply multiple transformations
|
||||
// First: merge first 3 dimensions
|
||||
auto step1_desc = transform_tensor_descriptor(
|
||||
base_desc,
|
||||
make_tuple(make_merge_transform(make_tuple(A, B, C)),
|
||||
make_pass_through_transform(D)),
|
||||
make_tuple(sequence<0, 1, 2>{}, sequence<3>{}), // input mapping
|
||||
make_tuple(sequence<0>{}, sequence<1>{}) // output: 2D
|
||||
);
|
||||
|
||||
// 3. Then unmerge back but with different grouping
|
||||
auto step2_desc = transform_tensor_descriptor(
|
||||
step1_desc,
|
||||
make_tuple(make_unmerge_transform(make_tuple(A*B, C)),
|
||||
make_pass_through_transform(D)),
|
||||
make_tuple(sequence<0>{}, sequence<1>{}), // from 2D
|
||||
make_tuple(sequence<0, 1>{}, sequence<2>{}) // to 3D
|
||||
);
|
||||
|
||||
// The adaptor chain is embedded in the descriptors
|
||||
// CK optimizes these at compile time
|
||||
|
||||
GPU Memory Layout Example
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
A practical example showing how adaptors create efficient :ref:`GPU memory access patterns <ck_tile_gpu_basics>`:
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
// Create descriptor for thread block tile: 64x64
|
||||
// With 8x8 vector loads per thread
|
||||
constexpr auto BlockM = 64;
|
||||
constexpr auto BlockN = 64;
|
||||
constexpr auto VectorM = 8;
|
||||
constexpr auto VectorN = 8;
|
||||
|
||||
// Thread arrangement: 8x8 threads
|
||||
constexpr auto ThreadM = BlockM / VectorM; // 8
|
||||
constexpr auto ThreadN = BlockN / VectorN; // 8
|
||||
|
||||
// Create block descriptor with proper layout
|
||||
auto block_desc = transform_tensor_descriptor(
|
||||
make_naive_tensor_descriptor_packed(
|
||||
make_tuple(number<BlockM>{}, number<BlockN>{})
|
||||
),
|
||||
make_tuple(
|
||||
make_unmerge_transform(make_tuple(
|
||||
number<ThreadM>{}, number<VectorM>{}
|
||||
)),
|
||||
make_unmerge_transform(make_tuple(
|
||||
number<ThreadN>{}, number<VectorN>{}
|
||||
))
|
||||
),
|
||||
make_tuple(sequence<0>{}, sequence<1>{}), // from 2D
|
||||
make_tuple(sequence<0, 2>{}, sequence<1, 3>{}) // to 4D: [TM,TN,VM,VN]
|
||||
);
|
||||
|
||||
// This creates the layout:
|
||||
// - Dimension 0,1: Thread indices
|
||||
// - Dimension 2,3: Vector indices within thread
|
||||
// Enables coalesced memory access on GPU
|
||||
// See :ref:`ck_tile_thread_mapping` for thread mapping details
|
||||
|
||||
Common Transform Chains
|
||||
-----------------------
|
||||
|
||||
CK Tile provides several common transform chain patterns used throughout GPU kernels:
|
||||
|
||||
**Padding for Convolution**
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
auto padded = transform_tensor_descriptor(
|
||||
input,
|
||||
make_tuple(make_pad_transform(H, pad_h, pad_h),
|
||||
make_pad_transform(W, pad_w, pad_w)),
|
||||
make_tuple(sequence<0>{}, sequence<1>{}),
|
||||
make_tuple(sequence<0>{}, sequence<1>{})
|
||||
);
|
||||
|
||||
**Dimension Merging for GEMM**
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
auto merged = transform_tensor_descriptor(
|
||||
input,
|
||||
make_tuple(make_merge_transform(make_tuple(M, K))),
|
||||
make_tuple(sequence<0, 1>{}),
|
||||
make_tuple(sequence<0>{})
|
||||
);
|
||||
|
||||
For complete GEMM optimization strategies, see :ref:`ck_tile_gemm_optimization`.
|
||||
|
||||
**Broadcasting for Elementwise Operations**
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
auto broadcast = transform_tensor_descriptor(
|
||||
scalar,
|
||||
make_tuple(make_replicate_transform(make_tuple(M, N))),
|
||||
make_tuple(sequence<>{}),
|
||||
make_tuple(sequence<0, 1>{})
|
||||
);
|
||||
|
||||
Key Concepts Summary
|
||||
--------------------
|
||||
|
||||
TensorAdaptors are the coordination layer that makes complex tensor operations possible:
|
||||
|
||||
- **Identity Adaptor**: Starting point for building transformations
|
||||
- **Transpose Adaptor**: Dimension reordering with permutation patterns
|
||||
- **Single-Stage Adaptors**: Custom transform chains with precise control
|
||||
- **Chained Adaptors**: Complex multi-stage transformation pipelines
|
||||
- **Transform Addition**: Extending existing adaptors with new transforms
|
||||
|
||||
Core concepts to remember:
|
||||
|
||||
- **Bottom/Top Dimensions**: Input and output coordinate spaces
|
||||
- **Hidden Dimensions**: Internal coordinate mappings between transforms
|
||||
- **Transform Chains**: Sequential application of multiple transforms
|
||||
- **Coordinate Transformation**: Bidirectional mapping between coordinate spaces
|
||||
- **Nested Transforms**: Complex multi-level transformation hierarchies
|
||||
|
||||
Key C++ Patterns in Composable Kernel
|
||||
--------------------------------------
|
||||
|
||||
1. **Descriptor-Based Adaptors**: In CK, adaptors are typically embedded within :ref:`tensor descriptors <ck_tile_descriptors>` rather than created separately
|
||||
2. **Compile-Time Optimization**: All transformations are resolved at compile time for zero overhead
|
||||
3. **Type Safety**: Template metaprogramming ensures coordinate transformations are type-safe
|
||||
4. **GPU Optimization**: Transform chains are designed for efficient GPU memory access patterns. See :ref:`ck_tile_lds_bank_conflicts` for LDS optimization.
|
||||
|
||||
TensorAdaptors bridge the gap between low-level transforms and high-level tensor operations, providing the flexibility to create advanced data layouts and access patterns that are essential for efficient GPU computing. They build upon the foundation of :ref:`BufferViews <ck_tile_buffer_views>` and :ref:`TensorViews <ck_tile_tensor_views>` to provide complex transformation capabilities.
|
||||
|
||||
Next Steps
|
||||
----------
|
||||
|
||||
- :ref:`ck_tile_descriptors` - How adaptors combine with element space to form complete tensor descriptors
|
||||
- :ref:`ck_tile_transforms` - Individual transform types and their properties
|
||||
- :ref:`ck_tile_tile_window` - How adaptors enable efficient data loading patterns
|
||||
- :ref:`ck_tile_space_filling_curve` - Advanced coordinate mapping techniques for cache optimization
|
||||
- :ref:`ck_tile_static_distributed_tensor` - How adaptors help manage distributed tensor storage
|
||||
443
docs/conceptual/ck_tile/buffer_views.rst
Normal file
@@ -0,0 +1,443 @@
|
||||
.. meta::
|
||||
:description: Composable Kernel CK Tile buffer views
|
||||
:keywords: composable kernel, CK, CK Tile, ROCm, API, buffer view, raw memory
|
||||
|
||||
.. _ck_tile_buffer_views:
|
||||
|
||||
CK Tile buffer view
|
||||
=======================
|
||||
|
||||
Buffer view is an abstraction that provides structured access to memory. The ``buffer_view`` class is exposed in ``include/ck_tile/core/tensor/buffer_view.hpp``.
|
||||
|
||||
Buffer view serves as the foundation for :ref:`ck_tile_tensor_views`. BufferView handles memory addressing and type safety, while TensorView builds upon this to add multi-dimensional coordinates (shape and strides).
|
||||
|
||||
|
||||
Buffer view provides the following advantages:
|
||||
|
||||
* A unified interface across global, shared, and register memory
|
||||
* Address spaces encoded in types, taking advantage of compile-time type checking
|
||||
* Configurable handling of invalid values, out-of-bounds operations, and conditional access patterns
|
||||
* Atomic operations for parallel algorithms
|
||||
* AMD GPU-specific optimizations
|
||||
* Automatic application of appropriate memory ordering constraints and cache control directives based on the target address space and operation type
|
||||
|
||||
|
||||
[TO DO: do we want to say more about these items? There wasn't a lot of detail in the original text, so I put them in a list for now]
|
||||
|
||||
|
||||
|
||||
Address Space Usage Patterns
|
||||
----------------------------
|
||||
|
||||
[TO DO: explain in words what the diagram shows]
|
||||
..
|
||||
Original mermaid diagram (edit here, then run update_diagrams.py)
|
||||
|
||||
.. mermaid::
|
||||
|
||||
flowchart TB
|
||||
subgraph CF ["Compute Flow"]
|
||||
direction LR
|
||||
GM1["Global Memory<br/>Input Data"] --> LDS["LDS<br/>Tile Cache"]
|
||||
LDS --> VGPR["VGPR<br/>Working Set"]
|
||||
VGPR --> Compute["Compute<br/>Operations"]
|
||||
Compute --> VGPR
|
||||
VGPR --> LDS2["LDS<br/>Reduction"]
|
||||
LDS2 --> GM2["Global Memory<br/>Output Data"]
|
||||
end
|
||||
|
||||
subgraph UP ["Usage Pattern"]
|
||||
direction LR
|
||||
P1["1. Load tile from Global → LDS"]
|
||||
P2["2. Load working set LDS → VGPR"]
|
||||
P3["3. Compute in VGPR"]
|
||||
P4["4. Store results VGPR → LDS"]
|
||||
P5["5. Reduce in LDS"]
|
||||
P6["6. Write final LDS → Global"]
|
||||
|
||||
P1 --> P2 --> P3 --> P4 --> P5 --> P6
|
||||
end
|
||||
|
||||
CF ~~~ UP
|
||||
|
||||
style GM1 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
|
||||
style LDS fill:#fed7aa,stroke:#f59e0b,stroke-width:2px
|
||||
style VGPR fill:#d1fae5,stroke:#10b981,stroke-width:2px
|
||||
style Compute fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
|
||||
|
||||
|
||||
.. image:: diagrams/buffer_views_1.svg
|
||||
:alt: Diagram
|
||||
:align: center
|
||||
|
||||
|
||||
Basic Creation
|
||||
~~~~~~~~~~~~~~
|
||||
|
||||
[TO DO: remove "modern C++ template metaprogramming" and "zero-overhead abstraction"]
|
||||
|
||||
[TO DO: might want to move the implementation details to a separate section under "reference"]
|
||||
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
#include <ck_tile/core/tensor/buffer_view.hpp>
|
||||
#include <ck_tile/core/numeric/integral_constant.hpp>
|
||||
|
||||
// Create buffer view in C++
|
||||
__device__ void example_buffer_creation()
|
||||
{
|
||||
// Static array in global memory
|
||||
float data[8] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
|
||||
constexpr index_t buffer_size = 8;
|
||||
|
||||
// Create buffer view for global memory
|
||||
// Template parameters: <AddressSpace>
|
||||
auto buffer_view = make_buffer_view<address_space_enum::global>(
|
||||
data, // pointer to data
|
||||
buffer_size // number of elements
|
||||
);
|
||||
|
||||
|
||||
// Implementation detail: The actual C++ template is:
|
||||
// template <address_space_enum BufferAddressSpace,
|
||||
// typename T,
|
||||
// typename BufferSizeType,
|
||||
// bool InvalidElementUseNumericalZeroValue = true,
|
||||
// amd_buffer_coherence_enum Coherence = amd_buffer_coherence_enum::coherence_default>
|
||||
// struct buffer_view
|
||||
|
||||
// Alternative: Create with explicit type
|
||||
using buffer_t = buffer_view<float*, address_space_enum::global>;
|
||||
buffer_t explicit_buffer{data, number<buffer_size>{}};
|
||||
|
||||
// Access properties at compile time
|
||||
constexpr auto size = buffer_view.get_buffer_size();
|
||||
constexpr auto space = buffer_view.get_address_space();
|
||||
|
||||
// The buffer_view type encodes:
|
||||
// - Data type (float)
|
||||
// - Address space (global memory)
|
||||
// - Size (known at compile time for optimization)
|
||||
static_assert(size == 8, "Buffer size should be 8");
|
||||
static_assert(space == address_space_enum::global, "Should be global memory");
|
||||
}
|
||||
|
||||
[TO DO: add details and remove unnecessary comments; the "implementation detail" comment can be moved out and either placed outside and explained further, or just removed, depending on what we want to do]
|
||||
|
||||
[TO DO: might want to put this implementation detail in the reference section]
|
||||
|
||||
Buffer view uses two modes, zero value mode and custom value mode, that can prevent serialization during bounds checking.
|
||||
|
||||
Zero value mode returns zero without branching when an access falls outside the valid buffer range. This is useful in convolutions where out-of-bounds accesses correspond to zero-padding.
|
||||
|
||||
Custom value mode returns a custom value without branching when an access falls outside the valid buffer range. Custom value mode accommodates algorithms that require specific values for boundary conditions.
|
||||
|
||||
[TO DO: there were two examples of custom value mode that I removed. I removed them because unlike for zero value mode where the example was convolution, the example was vague in custom value. Is there a more specific example of where custom value would be used?]
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
// Basic buffer view creation with automatic zero for invalid elements
|
||||
void basic_creation_example() {
|
||||
// Create data array
|
||||
constexpr size_t buffer_size = 8;
|
||||
float data[buffer_size] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
|
||||
|
||||
// Create global memory buffer view
|
||||
auto buffer_view = make_buffer_view<address_space_enum::global>(data, buffer_size);
|
||||
}
|
||||
|
||||
// Custom invalid value mode
|
||||
void custom_invalid_value_example() {
|
||||
constexpr size_t buffer_size = 8;
|
||||
float data[buffer_size] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
|
||||
float custom_invalid = 13.0f;
|
||||
|
||||
// Create buffer view with custom invalid value
|
||||
auto buffer_view = make_buffer_view<address_space_enum::global>(
|
||||
data, buffer_size, custom_invalid);
|
||||
}
|
||||
|
||||
|
||||
When ``InvalidElementUseNumericalZeroValue`` is set to true, the system uses zero value mode for out of bounds checking. When ``InvalidElementUseNumericalZeroValue`` is set to false, custom value mode is used. Zero value mode is used by default.
|
||||
|
||||
.. note::
|
||||
|
||||
Zero or custom invalid value is only returned for complete invalid values or out of bound access, for example when the first address of the vector is invalid. Partial out of bounds access during vector reads will not return useful results.
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
// Create data array
|
||||
constexpr size_t buffer_size = 8;
|
||||
float data[buffer_size] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
|
||||
float custom_invalid = 13.0f;
|
||||
|
||||
// Create global memory buffer view with zero invalid value mode (default)
|
||||
auto buffer_view = make_buffer_view<address_space_enum::global>(data, buffer_size, custom_invalid);
|
||||
|
||||
// Invalid element access with is_valid_element=false
|
||||
// Returns custom_invalid due to custom invalid value mode
|
||||
auto invalid_value = buffer_view.template get<float>(0, 0, false);
|
||||
printf("Invalid element: %.1f\n", invalid_value.get(0));
|
||||
|
||||
// Out of bounds access - AMD buffer addressing handles bounds checking
|
||||
// Will return custom_invalid when accessing beyond buffer_size
|
||||
auto oob_value = buffer_view.template get<float>(0, 100, true);
|
||||
printf("Out of bounds: %.1f\n", oob_value.get(0));
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Get Operations
|
||||
--------------
|
||||
|
||||
[TO DO: might want to put this implementation detail in the reference section]
|
||||
|
||||
The ``buffer_view`` ``get()`` method takes four parameters:
|
||||
|
||||
``i``: the primary offset into the buffer expressed in terms of elements of type T rather than raw bytes.
|
||||
|
||||
``linear_offset``: [TO DO: what is this?]
|
||||
|
||||
``is_valid_element``: [TO DO: what is this?]
|
||||
|
||||
[TO DO: the last param, that's the out of bounds handling, yes?]
|
||||
.. code:: cpp
|
||||
|
||||
get(index_t i,
|
||||
index_t linear_offset,
|
||||
bool is_valid_element,
|
||||
bool_constant<oob_conditional_check> = {})
|
||||
|
||||
|
||||
[TO DO: need some context around the code]
|
||||
|
||||
[TO DO: code chunks need to have detail and explanation so that the reader can see what they're trying to demonstrate.]
|
||||
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
// Create buffer view
|
||||
float data[8] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
|
||||
auto buffer_view = make_buffer_view<address_space_enum::global>(data, 8);
|
||||
|
||||
// Simple get - compile-time bounds checking when possible
|
||||
auto value_buf = buffer_view.template get<float>(0,1,true); //get the buffer from the buffer view
|
||||
float value = value_buf.get(0); //get the value from the buffer
|
||||
|
||||
// Get with valid flag - branchless conditional access
|
||||
bool valid_flag = false;
|
||||
value_buf = buffer_view.template get<float>(0,1,valid_flag);
|
||||
value = value_buf.get(0);
|
||||
    // Returns 0 if valid_flag is false
|
||||
|
||||
// vectorized get
|
||||
using float2 = ext_vector_t<float, 2>;
|
||||
auto vector_buf = buffer_view.template get<float2>(0, 0, true);
|
||||
// Loads 2 floats in a single instruction
|
||||
float val1 = vector_buf.get(0);
|
||||
float val2 = vector_buf.get(1);
|
||||
}
|
||||
|
||||
``ext_vector_t<float, N>`` enables compile-time selection of optimal load and store instructions that can transfer multiple data elements in a single memory transaction.
|
||||
|
||||
[TO DO: what is it actually doing? When does one use scalars vs vectors? Is it application specific or are there ]
|
||||
|
||||
..
|
||||
Original mermaid diagram (edit here, then run update_diagrams.py)
|
||||
|
||||
.. mermaid::
|
||||
|
||||
graph LR
|
||||
subgraph "Scalar Access (4 instructions)"
|
||||
S1["Load float[0]"] --> R1["Register 1"]
|
||||
S2["Load float[1]"] --> R2["Register 2"]
|
||||
S3["Load float[2]"] --> R3["Register 3"]
|
||||
S4["Load float[3]"] --> R4["Register 4"]
|
||||
end
|
||||
|
||||
subgraph "Vectorized Access (1 instruction)"
|
||||
V1["Load float4[0]"] --> VR["Vector Register<br/>(4 floats)"]
|
||||
end
|
||||
|
||||
subgraph "Performance Impact"
|
||||
Perf["4x fewer instructions<br/>Better memory bandwidth<br/>Reduced latency"]
|
||||
end
|
||||
|
||||
R1 & R2 & R3 & R4 --> Perf
|
||||
VR --> Perf
|
||||
|
||||
style S1 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
|
||||
style S2 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
|
||||
style S3 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
|
||||
style S4 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
|
||||
style V1 fill:#d1fae5,stroke:#10b981,stroke-width:2px
|
||||
style Perf fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
.. image:: diagrams/buffer_views_2.svg
|
||||
:alt: Diagram
|
||||
:align: center
|
||||
|
||||
Understanding BufferView Indexing
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
[TO DO: an explanation of the diagram is needed]
|
||||
|
||||
..
|
||||
Original mermaid diagram (edit here, then run update_diagrams.py)
|
||||
|
||||
.. mermaid::
|
||||
|
||||
flowchart LR
|
||||
subgraph "Input Parameters"
|
||||
Offset["Offset<br/>(e.g., 5)"]
|
||||
ValidFlag["Valid Flag<br/>(optional)"]
|
||||
end
|
||||
|
||||
subgraph "Processing"
|
||||
BoundsCheck{{"Bounds Check<br/>offset < buffer_size?"}}
|
||||
FlagCheck{{"Flag Check<br/>valid_flag == True?"}}
|
||||
Access["Access Memory<br/>buffer[offset]"]
|
||||
end
|
||||
|
||||
subgraph "Output"
|
||||
ValidResult["Valid Result<br/>Return value"]
|
||||
Invalid["Invalid Result<br/>Return 0 or default"]
|
||||
end
|
||||
|
||||
Offset --> BoundsCheck
|
||||
ValidFlag --> FlagCheck
|
||||
|
||||
BoundsCheck -->|Yes| FlagCheck
|
||||
BoundsCheck -->|No| Invalid
|
||||
|
||||
FlagCheck -->|Yes| Access
|
||||
FlagCheck -->|No| Invalid
|
||||
|
||||
Access --> ValidResult
|
||||
|
||||
style Offset fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
|
||||
style ValidFlag fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
|
||||
style ValidResult fill:#d1fae5,stroke:#10b981,stroke-width:2px
|
||||
style Invalid fill:#fee2e2,stroke:#ef4444,stroke-width:2px
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
.. image:: diagrams/buffer_views_3.svg
|
||||
:alt: Diagram
|
||||
:align: center
|
||||
|
||||
|
||||
|
||||
Update Operations
|
||||
-----------------
|
||||
|
||||
Update operations modify the buffer content. The ``set()`` method writes a value to a specific location.
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
void scalar_set_operations_example() {
|
||||
|
||||
// Create data array
|
||||
constexpr size_t buffer_size = 8;
|
||||
float data[buffer_size] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
|
||||
|
||||
// Create global memory buffer view
|
||||
auto buffer_view = make_buffer_view<address_space_enum::global>(data, buffer_size);
|
||||
|
||||
// Basic set: set<T>(i, linear_offset, is_valid_element, value)
|
||||
// Sets element at position i + linear_offset = 0 + 2 = 2
|
||||
buffer_view.template set<float>(0, 2, true, 99.0f);
|
||||
|
||||
// Invalid write with is_valid_element=false (ignored)
|
||||
buffer_view.template set<float>(0, 3, false, 777.0f);
|
||||
|
||||
// Out of bounds write - handled safely by AMD buffer addressing
|
||||
buffer_view.template set<float>(0, 100, true, 555.0f);
|
||||
|
||||
// Vector set
|
||||
using float2 = ext_vector_t<float, 2>;
|
||||
float2 pair_values{100.0f, 200.0f};
|
||||
buffer_view.template set<float2>(0, 5, true, pair_values);
|
||||
}
|
||||
|
||||
Atomic Operations
|
||||
-----------------
|
||||
|
||||
[TO DO: this needs information]
|
||||
|
||||
Atomic vs Non-Atomic Operations
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
..
|
||||
Original mermaid diagram (edit here, then run update_diagrams.py)
|
||||
|
||||
..
|
||||
Original mermaid diagram (edit here, then run update_diagrams.py)
|
||||
|
||||
.. mermaid::
|
||||
|
||||
graph TB
|
||||
subgraph "Non-Atomic Operation (Race Condition)"
|
||||
NA1["Thread 1: Read value (10)"] --> NA2["Thread 1: Add 5 (15)"]
|
||||
NA3["Thread 2: Read value (10)"] --> NA4["Thread 2: Add 3 (13)"]
|
||||
NA2 --> NA5["Thread 1: Write 15"]
|
||||
NA4 --> NA6["Thread 2: Write 13"]
|
||||
NA5 & NA6 --> NA7["Final value: 13 ❌<br/>(Lost update from Thread 1)"]
|
||||
end
|
||||
|
||||
subgraph "Atomic Operation (Thread-Safe)"
|
||||
A1["Thread 1: atomic_add(5)"] --> A2["Hardware ensures<br/>serialization"]
|
||||
A3["Thread 2: atomic_add(3)"] --> A2
|
||||
A2 --> A4["Final value: 18 ✓<br/>(Both updates applied)"]
|
||||
end
|
||||
|
||||
style NA7 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
|
||||
style A4 fill:#d1fae5,stroke:#10b981,stroke-width:2px
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
.. image:: diagrams/buffer_views_4.svg
|
||||
:alt: Diagram
|
||||
:align: center
|
||||
|
||||
C++ Atomic Operations
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
__device__ void example_atomic_operations()
|
||||
{
|
||||
// Shared memory for workgroup-level reductions
|
||||
__shared__ float shared_sum[256];
|
||||
auto shared_buffer_view = make_buffer_view<address_space_enum::lds>(
|
||||
shared_sum, 256
|
||||
);
|
||||
|
||||
// Initialize shared memory
|
||||
if (threadIdx.x < 256) {
|
||||
           shared_buffer_view.template set<float>(threadIdx.x, 0, true, 0.0f);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// Each thread atomically adds to shared memory
|
||||
auto my_value = static_cast<float>(threadIdx.x);
|
||||
shared_buffer_view.template update<memory_operation_enum::atomic_add, float>(0,0,true,my_value);
|
||||
|
||||
// Atomic max for finding maximum value
|
||||
shared_buffer_view.template update<memory_operation_enum::atomic_max, float>(0,1,true,my_value);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
390
docs/conceptual/ck_tile/cache_flushing_benchmarking.rst
Normal file
@@ -0,0 +1,390 @@
|
||||
===================================
|
||||
Cache Flushing for GPU Benchmarking
|
||||
===================================
|
||||
|
||||
Overview
|
||||
========
|
||||
|
||||
When benchmarking GPU kernels, accurate performance measurements require understanding and controlling cache behavior. Running a kernel multiple times with the same input data can lead to artificially fast results due to **cache hits**, where data and instructions are served from fast GPU cache rather than slow High Bandwidth Memory (HBM).
|
||||
|
||||
Composable Kernel provides two complementary mechanisms to ensure realistic "cold cache" performance measurements:
|
||||
|
||||
1. **Instruction Cache Flushing** - Invalidates cached GPU instructions
|
||||
2. **Rotating Memory Buffers** - Cycles through multiple data buffer copies at different memory addresses
|
||||
|
||||
This document explains how these mechanisms work and how to use them in benchmarks.
|
||||
|
||||
The Problem: Hot vs. Cold Cache
|
||||
================================
|
||||
|
||||
GPU Memory Hierarchy
|
||||
--------------------
|
||||
|
||||
GPUs have a multi-level cache hierarchy:
|
||||
|
||||
.. code-block:: text
|
||||
|
||||
Fast → Slow, Small → Large
|
||||
|
||||
┌─────────────────┐
|
||||
│ Register File │ ~1 cycle
|
||||
├─────────────────┤
|
||||
│ L1 I-Cache │ ~4 cycles ← Instruction cache
|
||||
├─────────────────┤
|
||||
│ L1 Data Cache │ ~4 cycles ← Data cache
|
||||
├─────────────────┤
|
||||
│ L2 Cache │ ~50 cycles
|
||||
├─────────────────┤
|
||||
│ HBM (VRAM) │ ~400 cycles
|
||||
└─────────────────┘
|
||||
|
||||
Cache Behavior Without Flushing
|
||||
--------------------------------
|
||||
|
||||
When running a kernel repeatedly without cache management:
|
||||
|
||||
.. code-block:: text
|
||||
|
||||
Run 1: [Cache MISS] → Fetch from HBM → 400 cycles → 5.2ms
|
||||
Run 2: [Cache HIT!] → Read from L1/L2 → 4 cycles → 3.8ms ← Artificially fast!
|
||||
Run 3: [Cache HIT!] → Read from L1/L2 → 4 cycles → 3.8ms
|
||||
...
|
||||
Average: 4.1ms (misleading - not representative of real-world performance)
|
||||
|
||||
This leads to:
|
||||
|
||||
- ✗ Inflated performance numbers
|
||||
- ✗ Inconsistent timing between first and subsequent runs
|
||||
- ✗ Unfair comparisons between different kernels
|
||||
- ✗ Misleading optimization decisions
|
||||
|
||||
Solution 1: Instruction Cache Flushing
|
||||
=======================================
|
||||
|
||||
What is Instruction Cache?
|
||||
---------------------------
|
||||
|
||||
The **instruction cache (I-cache)** is a small, fast memory on each GPU compute unit that stores recently executed machine code instructions. When a thread needs to execute an instruction:
|
||||
|
||||
1. The **Program Counter (PC)** holds the instruction's memory address
|
||||
2. The GPU checks if that address exists in the I-cache
|
||||
3. **Cache HIT**: Instruction read instantly from I-cache (~4 cycles)
|
||||
4. **Cache MISS**: Instruction fetched from HBM (~400 cycles), then cached
|
||||
|
||||
How It Works
|
||||
------------
|
||||
|
||||
The GPU uses **address-based caching**: when you launch the same kernel multiple times, the kernel code resides at the same memory address, allowing the I-cache to serve cached instructions.
|
||||
|
||||
.. code-block:: text
|
||||
|
||||
First Kernel Run:
|
||||
PC = 0x7F8A0000 → I-Cache lookup → MISS → Fetch from HBM → Cache it
|
||||
|
||||
Second Kernel Run (without flush):
|
||||
PC = 0x7F8A0000 → I-Cache lookup → HIT! → Read from cache (fast!)
|
||||
|
||||
Second Kernel Run (with flush):
|
||||
PC = 0x7F8A0000 → I-Cache lookup → MISS → Fetch from HBM again
|
||||
|
||||
The ``flush_icache()`` Function
|
||||
--------------------------------
|
||||
|
||||
Located in ``include/ck_tile/host/flush_icache.hpp``:
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
namespace ck_tile {
|
||||
// GPU kernel to invalidate instruction cache for accurate benchmarking.
|
||||
static __global__ void flush_cache()
|
||||
{
|
||||
asm __volatile__("s_icache_inv \n\t" // Invalidate I-cache
|
||||
"s_nop 0 \n\t" // Wait cycles (16 NOPs)
|
||||
"s_nop 0 \n\t"
|
||||
// ... 14 more NOPs
|
||||
"s_nop 0 \n\t" ::
|
||||
:);
|
||||
}
|
||||
}
|
||||
|
||||
**Key Components:**
|
||||
|
||||
- ``s_icache_inv``: AMD GPU instruction that invalidates the L1 instruction cache on the current compute unit
|
||||
- ``s_nop 0`` (×16): No-operation instructions (NOPs) that create a 16-cycle delay to ensure cache invalidation completes before the kernel exits
|
||||
|
||||
**Why 16 NOPs?**
|
||||
|
||||
The ``s_icache_inv`` instruction is **asynchronous**: it initiates cache invalidation but doesn't wait for completion. Without the NOPs, the kernel might exit before the flush finishes, leading to race conditions and incomplete cache invalidation.
|
||||
|
||||
Launching the Flush Kernel
|
||||
---------------------------
|
||||
|
||||
From ``include/ck_tile/host/rotating_buffers.hpp``:
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
inline void flush_icache()
|
||||
{
|
||||
hipDeviceProp_t deviceProps;
|
||||
HIP_CHECK_ERROR(hipGetDeviceProperties(&deviceProps, 0));
|
||||
|
||||
// Over-provision blocks to ensure all CUs execute the flush instruction.
|
||||
// With imperfect scheduling, launching exactly 1 block per CU doesn't guarantee coverage.
|
||||
// 60x over-provisioning provides statistical certainty that every CU gets at least one block.
|
||||
constexpr int32_t blocks_per_cu = 60;
|
||||
int32_t gpu_block3 = deviceProps.multiProcessorCount * blocks_per_cu;
|
||||
|
||||
ck_tile::flush_cache<<<dim3(gpu_block3), dim3(64), 0, nullptr>>>();
|
||||
HIP_CHECK_ERROR(hipGetLastError());
|
||||
}
|
||||
|
||||
**Why 60× Over-provisioning?**
|
||||
|
||||
The I-cache is **per-compute-unit** (CU). To flush all CUs, we must ensure every CU executes at least one instance of ``s_icache_inv``.
|
||||
|
||||
- Launching exactly 1 block per CU doesn't guarantee 1:1 mapping due to GPU scheduler behavior
|
||||
- Launching 60 blocks per CU provides statistical certainty that every CU receives work
|
||||
- For a 120-CU GPU: 120 × 60 = 7,200 blocks × 64 threads = 460,800 total threads
|
||||
|
||||
This ensures comprehensive instruction cache flushing across all compute units.
|
||||
|
||||
Solution 2: Rotating Memory Buffers
|
||||
====================================
|
||||
|
||||
What is Data Cache?
|
||||
-------------------
|
||||
|
||||
While I-cache stores instructions, **data cache** (L1 data, L2) stores matrix data (inputs A, B and output C). When a kernel reads the same matrix repeatedly, the data is served from cache rather than HBM.
|
||||
|
||||
The RotatingMemWrapper Struct
|
||||
------------------------------
|
||||
|
||||
Located in ``include/ck_tile/host/rotating_buffers.hpp``:
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
template <typename ADataType, typename BDataType>
|
||||
struct RotatingMemWrapper
|
||||
{
|
||||
RotatingMemWrapper(const void* a_ptr_,
|
||||
const void* b_ptr_,
|
||||
std::size_t rotating_count_,
|
||||
std::size_t size_a_,
|
||||
std::size_t size_b_);
|
||||
|
||||
void Next(); // Rotate to next buffer copy
|
||||
~RotatingMemWrapper() noexcept; // Cleanup
|
||||
};
|
||||
|
||||
**Purpose**: Prevents data cache reuse by cycling through multiple copies of input matrices at different memory addresses.
|
||||
|
||||
How It Works
|
||||
------------
|
||||
|
||||
**Constructor: Create Buffer Copies**
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
RotatingMemWrapper(a_ptr, b_ptr, rotating_count=3, size_a, size_b)
|
||||
{
|
||||
// Store original buffer pointers as first entry
|
||||
p_a_grids.push_back(a_ptr);
|
||||
p_b_grids.push_back(b_ptr);
|
||||
|
||||
// Create (rotating_count - 1) additional copies at different memory addresses
|
||||
for(size_t i = 1; i < rotating_count; i++)
|
||||
{
|
||||
void* pADeviceBuf;
|
||||
hipMalloc(&pADeviceBuf, size_a);
|
||||
hipMemcpy(pADeviceBuf, p_a_grids[0], size_a, hipMemcpyDeviceToDevice);
|
||||
p_a_grids.push_back(pADeviceBuf);
|
||||
|
||||
// Same for B matrix...
|
||||
}
|
||||
}
|
||||
|
||||
Result:
|
||||
|
||||
.. code-block:: text
|
||||
|
||||
GPU Memory:
|
||||
┌─────────────────────────┐
|
||||
│ Matrix A (original) │ Address: 0x1000
|
||||
│ Matrix A (copy 1) │ Address: 0x2000
|
||||
│ Matrix A (copy 2) │ Address: 0x3000
|
||||
│ Matrix B (original) │ Address: 0x4000
|
||||
│ Matrix B (copy 1) │ Address: 0x5000
|
||||
│ Matrix B (copy 2) │ Address: 0x6000
|
||||
└─────────────────────────┘
|
||||
|
||||
**Next(): Rotate to Next Buffer**
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
void Next()
|
||||
{
|
||||
if(rotating_count > 1)
|
||||
{
|
||||
std::size_t idx = iter++ % rotating_count; // Cycle: 0,1,2,0,1,2,...
|
||||
a_ptr = p_a_grids[idx];
|
||||
b_ptr = p_b_grids[idx];
|
||||
}
|
||||
}
|
||||
|
||||
Usage in benchmarking loop:
|
||||
|
||||
.. code-block:: text
|
||||
|
||||
Iteration 1: Next() → Use buffers at 0x1000, 0x4000 → Kernel reads → Cache miss
|
||||
Iteration 2: Next() → Use buffers at 0x2000, 0x5000 → Kernel reads → Cache miss
|
||||
Iteration 3: Next() → Use buffers at 0x3000, 0x6000 → Kernel reads → Cache miss
|
||||
Iteration 4: Next() → Use buffers at 0x1000, 0x4000 → Kernel reads → Cache miss
|
||||
...
|
||||
|
||||
By the time the buffers cycle back to the first copy, the cache has likely evicted the old data.
|
||||
|
||||
**Destructor: Cleanup**
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
~RotatingMemWrapper() noexcept
|
||||
{
|
||||
// Restore original buffer pointers
|
||||
a_ptr = p_a_grids[0];
|
||||
b_ptr = p_b_grids[0];
|
||||
|
||||
// Free extra buffer copies (index 0 is original, don't free it)
|
||||
for(size_t i = 1; i < rotating_count; i++)
|
||||
{
|
||||
hipFree(p_a_grids[i]);
|
||||
hipFree(p_b_grids[i]);
|
||||
}
|
||||
}
|
||||
|
||||
Using Cache Flushing in Practice
|
||||
=================================
|
||||
|
||||
Command Line Argument
|
||||
---------------------
|
||||
|
||||
The ``flush_cache`` command-line argument controls whether cache flushing is enabled:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
# Enable cache flushing (cold cache benchmarking)
|
||||
./gemm_example --flush_cache=1 --rotating_count=3
|
||||
|
||||
# Disable cache flushing (hot cache benchmarking)
|
||||
./gemm_example --flush_cache=0
|
||||
|
||||
In ``run_gemm_quant_example.inc``:
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
bool flush_cache = arg_parser.get_bool("flush_cache");
|
||||
int rotating_count = arg_parser.get_int("rotating_count");
|
||||
|
||||
// Pass to stream_config
|
||||
ck_tile::stream_config{
|
||||
nullptr, // stream
|
||||
true, // time_kernel
|
||||
1, // log_level
|
||||
n_warmup, // cold_niters (warmup iterations)
|
||||
n_repeat, // nrepeat (timed iterations)
|
||||
true, // is_gpu_timer
|
||||
flush_cache, // flush_cache_ ← Controls cache flushing
|
||||
rotating_count // rotating_count_ ← Number of buffer copies
|
||||
}
|
||||
|
||||
Integration with Timing Loop
|
||||
-----------------------------
|
||||
|
||||
The ``launch_kernel_time_mask`` function integrates both mechanisms:
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
// From include/ck_tile/host/kernel_launch.hpp
|
||||
template <typename PreprocessFunc, typename... Callables>
|
||||
float launch_kernel_time_mask(const stream_config& s,
|
||||
PreprocessFunc preprocess,
|
||||
Callables&&... callables)
|
||||
{
|
||||
// Timing loop (simplified)
|
||||
for(int i = 0; i < s.nrepeat_; i++)
|
||||
{
|
||||
preprocess(); // 1. Flush I-cache + rotate buffers
|
||||
callables_func(); // 2. Launch kernel
|
||||
}
|
||||
|
||||
return average_time;
|
||||
}
|
||||
|
||||
Complete Example
|
||||
----------------
|
||||
|
||||
From ``example/ck_tile/38_block_scale_gemm/run_gemm_quant_example.inc``:
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
// Setup rotating memory wrapper
|
||||
RotatingMemWrapper<ADataType, BDataType> rotating_mem(
|
||||
a_ptr, b_ptr, rotating_count, size_a, size_b);
|
||||
|
||||
// Define preprocessing: flush I-cache + rotate buffers
|
||||
auto preprocess = [&]() {
|
||||
if(flush_cache) {
|
||||
flush_icache(); // Invalidate instruction cache
|
||||
rotating_mem.Next(); // Switch to next buffer copy
|
||||
}
|
||||
};
|
||||
|
||||
// Define kernel launch
|
||||
auto kernel_launch = [&]() {
|
||||
gemm_kernel<<<grid, block>>>(a_ptr, b_ptr, c_ptr, M, N, K);
|
||||
};
|
||||
|
||||
// Benchmark with cache control
|
||||
float avg_time = launch_kernel_time_mask(
|
||||
stream_config, // Config with flush_cache and rotating_count
|
||||
preprocess, // Flush + rotate before each iteration
|
||||
kernel_launch // Kernel to benchmark
|
||||
);
|
||||
|
||||
Execution Flow
|
||||
--------------
|
||||
|
||||
With ``flush_cache=true`` and ``rotating_count=3``, ``nrepeat=100``:
|
||||
|
||||
.. code-block:: text
|
||||
|
||||
Warmup Phase (n_warmup iterations):
|
||||
- Run kernel without timing
|
||||
- Prime GPU, warm up scheduler
|
||||
|
||||
Timed Phase (100 iterations):
|
||||
Iteration 1: flush_icache() → rotating_mem.Next() → Use buffer copy 0 → kernel() → Measure
|
||||
Iteration 2: flush_icache() → rotating_mem.Next() → Use buffer copy 1 → kernel() → Measure
|
||||
Iteration 3: flush_icache() → rotating_mem.Next() → Use buffer copy 2 → kernel() → Measure
|
||||
Iteration 4: flush_icache() → rotating_mem.Next() → Use buffer copy 0 → kernel() → Measure
|
||||
...
|
||||
Iteration 100: flush_icache() → rotating_mem.Next() → Use buffer copy 1 → kernel() → Measure
|
||||
|
||||
Return: Average time per iteration (excluding preprocess overhead)
|
||||
|
||||
References
|
||||
==========
|
||||
|
||||
Related Files
|
||||
-------------
|
||||
|
||||
- ``include/ck_tile/host/flush_icache.hpp`` - I-cache flush kernel implementation
|
||||
- ``include/ck_tile/host/rotating_buffers.hpp`` - RotatingMemWrapper implementation
|
||||
- ``include/ck_tile/host/kernel_launch.hpp`` - Timing loop integration
|
||||
|
||||
Conclusion
|
||||
==========
|
||||
|
||||
Accurate GPU kernel benchmarking requires careful control of cache behavior. The combination of **instruction cache flushing** (``flush_icache``) and **rotating memory buffers** (``RotatingMemWrapper``) ensures realistic "cold cache" performance measurements that represent real-world application behavior.
|
||||
|
||||
By understanding and utilizing these mechanisms through the ``flush_cache`` command-line argument, you can obtain trustworthy performance data for optimization decisions and fair kernel comparisons.
|
||||
|
||||
227
docs/conceptual/ck_tile/convert_mermaid_to_svg.py
Normal file
@@ -0,0 +1,227 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
"""
|
||||
Script to convert all mermaid diagrams in CK Tile docs to SVGs.
|
||||
This script:
|
||||
1. Finds all mermaid blocks in RST files
|
||||
2. Converts them to SVG using mmdc
|
||||
3. Updates RST files to use SVG images with commented mermaid source
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
# Configuration
# Directory this script lives in (the ck_tile docs directory).
DOCS_DIR = Path(__file__).parent
# Output directory for the generated SVG files.
DIAGRAMS_DIR = DOCS_DIR / "diagrams"
# RST files scanned for ``.. mermaid::`` blocks.
RST_FILES = [
    "convolution_example.rst",
    "encoding_internals.rst",
    "lds_index_swapping.rst",
    "space_filling_curve.rst",
    "sweep_tile.rst",
    "tensor_coordinates.rst",
    "thread_mapping.rst",
    "static_distributed_tensor.rst",
    "load_store_traits.rst",
    "tile_window.rst",
    "transforms.rst",
    "descriptors.rst",
    "coordinate_movement.rst",
    "adaptors.rst",
    "introduction_motivation.rst",
    "buffer_views.rst",
    "tensor_views.rst",
    "coordinate_systems.rst",
    "tile_distribution.rst",
]

# Pattern to find mermaid blocks (can be indented with 3 spaces for commented blocks)
# NOTE(review): the literal spaces inside this pattern look like they should be
# three-space RST indents — confirm the whitespace survived copy/paste.
MERMAID_PATTERN = re.compile(
    r"^(?: )?\.\. mermaid::\s*\n((?:(?:\n| .*))*)", re.MULTILINE
)
|
||||
|
||||
|
||||
def extract_mermaid_content(block):
    """Return the bare mermaid code from a captured block, RST indentation removed."""
    stripped = []
    for raw in block.split("\n"):
        if raw.startswith(" "):
            # Drop the leading RST indentation (3 characters).
            # NOTE(review): the prefix check looks like it should match the
            # three-space RST indent — confirm the literal spaces.
            stripped.append(raw[3:])
        elif not raw.strip():
            stripped.append("")
    return "\n".join(stripped).strip()
|
||||
|
||||
|
||||
def generate_diagram_name(file_path, diagram_index, total_in_file):
    """Generate a descriptive SVG file name for the diagram.

    A file with a single diagram gets ``<stem>.svg``; files with several
    diagrams get a 1-based numeric suffix, e.g. ``<stem>_2.svg``.
    """
    stem = file_path.stem
    suffix = "" if total_in_file == 1 else f"_{diagram_index + 1}"
    return f"{stem}{suffix}.svg"
|
||||
|
||||
|
||||
def convert_mermaid_to_svg(mermaid_code, output_path):
    """Convert mermaid source text to an SVG file using mermaid-cli (mmdc).

    Args:
        mermaid_code: The mermaid diagram source.
        output_path: Destination path for the generated SVG (needs ``.name``).

    Returns:
        True on success, False if mmdc fails or is not installed.
    """
    # Write the mermaid source to a temporary file for mmdc to consume.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".mmd", delete=False, encoding="utf-8"
    ) as tmp:
        tmp.write(mermaid_code)
        tmp_path = tmp.name

    try:
        # shell=True is required on Windows to resolve mmdc's .cmd shim, but on
        # POSIX, shell=True with a list passes ONLY the first element to the
        # shell and silently drops every argument — so enable it on Windows only.
        subprocess.run(
            [
                "mmdc",
                "-i",
                tmp_path,
                "-o",
                str(output_path),
                "-t",
                "neutral",
                "-b",
                "transparent",
            ],
            capture_output=True,
            text=True,
            check=True,
            shell=(os.name == "nt"),
        )
        print(f" ✓ Generated: {output_path.name}")
        return True
    except FileNotFoundError:
        # Without the shell, a missing mmdc raises FileNotFoundError rather
        # than CalledProcessError; treat it as the same failure.
        print(" ✗ Error: mmdc not found (npm install -g @mermaid-js/mermaid-cli)")
        return False
    except subprocess.CalledProcessError as e:
        print(f" ✗ Error converting diagram: {e.stderr}")
        return False
    finally:
        # Always clean up the temp file, success or not.
        os.unlink(tmp_path)
|
||||
|
||||
|
||||
def update_rst_file(file_path, diagrams_info):
    """Rewrite *file_path*, replacing each mermaid block with its commented
    source followed by a reference to the generated SVG image."""
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    # Apply replacements from the end of the file backwards so that earlier
    # match offsets remain valid as the text changes length.
    diagrams_info.sort(key=lambda d: d["position"], reverse=True)

    for entry in diagrams_info:
        m = entry["match"]

        # Commented copy of the original mermaid source...
        replacement_lines = [
            ".. ",
            " Original mermaid diagram (edit here, then run update_diagrams.py)",
            " ",
        ]
        replacement_lines += [f" {ln}" for ln in m.group(0).split("\n")]

        # ...followed by a reference to the generated SVG.
        replacement_lines += [
            "",
            f".. image:: diagrams/{entry['svg_name']}",
            " :alt: Diagram",
            " :align: center",
            "",
        ]

        text = text[: m.start()] + "\n".join(replacement_lines) + text[m.end():]

    with open(file_path, "w", encoding="utf-8") as f:
        f.write(text)

    print(f" ✓ Updated: {file_path.name}")
|
||||
|
||||
|
||||
def process_file(file_path):
    """Convert every mermaid diagram in one RST file and update it in place."""
    print(f"\nProcessing {file_path.name}...")

    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    found = list(MERMAID_PATTERN.finditer(text))
    if not found:
        print(" No mermaid diagrams found.")
        return

    print(f" Found {len(found)} diagram(s)")

    converted = []
    for idx, match in enumerate(found):
        source = extract_mermaid_content(match.group(1))
        svg_name = generate_diagram_name(file_path, idx, len(found))

        # Only record diagrams whose SVG conversion succeeded.
        if convert_mermaid_to_svg(source, DIAGRAMS_DIR / svg_name):
            converted.append(
                {"match": match, "svg_name": svg_name, "position": match.start()}
            )

    # Rewrite the RST file only when at least one diagram converted.
    if converted:
        update_rst_file(file_path, converted)
|
||||
|
||||
|
||||
def main():
    """Entry point: verify mmdc is installed, then convert diagrams in all RST files.

    Returns:
        0 on success, 1 if mermaid-cli (mmdc) is not available.
    """
    print("CK Tile Mermaid to SVG Converter")
    print("=" * 50)

    # Verify mmdc is available before doing any work.
    try:
        # shell=True is only needed on Windows to resolve mmdc's .cmd shim; on
        # POSIX, shell=True with a list would drop the "--version" argument,
        # so enable it on Windows only.
        subprocess.run(
            ["mmdc", "--version"],
            capture_output=True,
            check=True,
            shell=(os.name == "nt"),
        )
    except (subprocess.CalledProcessError, FileNotFoundError):
        print("Error: mermaid-cli (mmdc) not found. Please install it:")
        print(" npm install -g @mermaid-js/mermaid-cli")
        return 1

    # Ensure diagrams directory exists
    DIAGRAMS_DIR.mkdir(parents=True, exist_ok=True)

    # Process each configured RST file, warning about any that are missing.
    for rst_file in RST_FILES:
        file_path = DOCS_DIR / rst_file
        if file_path.exists():
            process_file(file_path)
        else:
            print(f"\n⚠ Warning: {rst_file} not found")

    print("\n" + "=" * 50)
    print("✓ Conversion complete!")
    print(f"SVG files saved to: {DIAGRAMS_DIR}")

    return 0
|
||||
|
||||
|
||||
# Script entry point: exit with main()'s status code (0 = success).
if __name__ == "__main__":
    exit(main())
|
||||
87
docs/conceptual/ck_tile/convert_raw_html_to_commented.py
Normal file
@@ -0,0 +1,87 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
"""Convert raw HTML mermaid blocks to commented format for SVG conversion."""
|
||||
|
||||
import os
|
||||
import re
|
||||
|
||||
|
||||
def convert_raw_html_to_commented(content):
    """Convert raw HTML mermaid blocks to commented mermaid format.

    Each ``.. raw:: html`` block wrapping a ``<div class="mermaid">`` is
    replaced by a commented ``.. mermaid::`` directive so that
    convert_mermaid_to_svg.py can later render it.

    Args:
        content: Full text of an RST file.

    Returns:
        The converted text (unchanged if no raw HTML mermaid blocks matched).
    """

    # Pattern to match raw HTML mermaid blocks
    # NOTE(review): the literal spaces before <div and </div look like they
    # should be three-space indents — confirm the whitespace survived copy/paste.
    pattern = r'\.\. raw:: html\n\n <div class="mermaid"[^>]*>\n(.*?)\n </div>'

    def replace_block(match):
        # Re-indent the captured mermaid code for placement under a
        # ``.. mermaid::`` directive.
        mermaid_code = match.group(1)
        # The mermaid code in HTML has 3-space indentation, keep it
        # but add 3 more spaces for .. mermaid:: indentation
        mermaid_lines = mermaid_code.split("\n")
        properly_indented = []
        for line in mermaid_lines:
            if line.strip():  # Non-empty line
                # Line already has 3 spaces from HTML, add 3 more for mermaid block
                properly_indented.append(" " + line)
            else:
                properly_indented.append("")

        indented_code = "\n".join(properly_indented)

        # Create commented format matching the expected pattern
        commented = f"""..
 Original mermaid diagram (edit here, then run update_diagrams.py)

 .. mermaid::

{indented_code}


"""
        return commented

    # DOTALL lets (.*?) span the multi-line diagram body.
    return re.sub(pattern, replace_block, content, flags=re.DOTALL)
|
||||
|
||||
|
||||
def main():
    """Process files with raw HTML mermaid blocks.

    Rewrites each target .rst file in place, replacing ``.. raw:: html``
    mermaid blocks with commented ``.. mermaid::`` directives via
    :func:`convert_raw_html_to_commented`, then prints a summary.
    Files that do not exist are skipped with a message.

    :return: ``None``. Progress and a summary are written to stdout.
    """

    files_to_convert = [
        "introduction_motivation.rst",
        "buffer_views.rst",
        "tensor_views.rst",
        "coordinate_systems.rst",
        "tile_distribution.rst",
    ]

    # (filename, number_of_blocks) for each file actually rewritten.
    converted_files = []

    for filename in files_to_convert:
        if not os.path.exists(filename):
            # Fix: interpolate the actual filename; the message previously
            # printed a literal placeholder instead of the file being skipped.
            print(f"Skipping {filename} - not found")
            continue

        with open(filename, "r", encoding="utf-8") as f:
            original = f.read()

        converted = convert_raw_html_to_commented(original)

        if converted != original:
            with open(filename, "w", encoding="utf-8") as f:
                f.write(converted)

            # Count blocks in the ORIGINAL text; the converted text no
            # longer contains the ".. raw:: html" marker.
            blocks_converted = original.count(".. raw:: html")
            converted_files.append((filename, blocks_converted))
            print(f"✓ Converted {filename}: {blocks_converted} blocks")
        else:
            print(f"  {filename}: no raw HTML blocks found")

    print("\n=== CONVERSION COMPLETE ===")
    print(f"Files converted: {len(converted_files)}")
    print(f"Total blocks: {sum(c for _, c in converted_files)}")
    print("\nNext: Run convert_mermaid_to_svg.py to generate SVG files")
|
||||
|
||||
|
||||
# Script entry point: convert all listed .rst files, printing a summary.
if __name__ == "__main__":
    main()
|
||||
567
docs/conceptual/ck_tile/convolution_example.rst
Normal file
@@ -0,0 +1,567 @@
|
||||
.. meta::
|
||||
:description: CK Tile convolution implementation example
|
||||
:keywords: CK Tile, convolution, im2col, tensor descriptors, GPU optimization
|
||||
|
||||
.. _ck_tile_convolution_example:
|
||||
|
||||
*****************************************
|
||||
Convolution Implementation with CK Tile
|
||||
*****************************************
|
||||
|
||||
Overview
|
||||
========
|
||||
|
||||
This section covers how CK Tile's :ref:`tensor descriptor <ck_tile_descriptors>` system enables efficient convolution implementations on GPUs. Convolution operations are fundamental in deep learning, and understanding their optimization reveals how high-performance libraries achieve their efficiency. This section progresses from a naive implementation to an optimized approach using tensor descriptors, showing how they enable efficient memory access patterns for GPU acceleration.
|
||||
|
||||
The key insight is that convolution can be transformed from a complex nested loop operation into a highly parallel matrix multiplication through the image to column (im2col) transformation. CK Tile's tensor descriptors provide the perfect abstraction for implementing this transformation efficiently without data duplication.
|
||||
|
||||
..
   Original mermaid diagram (edit here, then run update_diagrams.py)
||||
.. mermaid::
|
||||
|
||||
graph TB
|
||||
subgraph "Convolution Process"
|
||||
I["Input Image<br/>6×6"]
|
||||
K["Kernel<br/>3×3"]
|
||||
SW["Sliding Window<br/>Extract 3×3 patches"]
|
||||
DP["Dot Product<br/>Element-wise multiply & sum"]
|
||||
O["Output<br/>4×4"]
|
||||
end
|
||||
|
||||
subgraph "Im2col Optimization"
|
||||
W["Windows Matrix<br/>16×9<br/>(all patches)"]
|
||||
KF["Kernel Flattened<br/>9×1"]
|
||||
MM["Matrix Multiply<br/>W @ K"]
|
||||
OF["Output Flattened<br/>16×1"]
|
||||
end
|
||||
|
||||
I --> SW
|
||||
K --> DP
|
||||
SW --> DP
|
||||
DP --> O
|
||||
|
||||
SW --> W
|
||||
K --> KF
|
||||
W --> MM
|
||||
KF --> MM
|
||||
MM --> OF
|
||||
OF --> O
|
||||
|
||||
style I fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
|
||||
style O fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
|
||||
style MM fill:#fff3e0,stroke:#f57c00,stroke-width:2px
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
.. image:: diagrams/convolution_example.svg
   :alt: Diagram
   :align: center
|
||||
|
||||
Understanding Sliding Windows
|
||||
=============================
|
||||
|
||||
Before diving into convolution, it's crucial to understand how sliding windows work. In convolution, overlapping patches need to be extracted from the input image. Traditional approaches would copy these patches, but CK Tile uses :ref:`tensor descriptors <ck_tile_descriptors>` to create efficient :ref:`views <ck_tile_tensor_views>` without data duplication.
|
||||
|
||||
Simple Tiling Example
|
||||
---------------------
|
||||
|
||||
Non-overlapping tiles:
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
// Create a 6x6 matrix tiled into 2x2 blocks
|
||||
template<typename DataType>
|
||||
struct SimpleTiling {
|
||||
static constexpr index_t kMatrixSize = 6;
|
||||
static constexpr index_t kTileSize = 2;
|
||||
static constexpr index_t kNumTiles = kMatrixSize / kTileSize;
|
||||
|
||||
// Original matrix: shape=(6, 6), strides=(6, 1)
|
||||
// Tiled view: shape=(3, 3, 2, 2), strides=(12, 2, 6, 1)
|
||||
// See :ref:`ck_tile_descriptors` for descriptor details
|
||||
using TileDescriptor = TensorDescriptor<
|
||||
Sequence<kNumTiles, kNumTiles, kTileSize, kTileSize>,
|
||||
Sequence<12, 2, 6, 1>
|
||||
>;
|
||||
|
||||
__device__ void demonstrate() {
|
||||
// To move to next tile row: skip 2 matrix rows = 6 × 2 = 12
|
||||
// To move to next tile col: skip 2 matrix cols = 1 × 2 = 2
|
||||
// Within tile: use original strides (6, 1)
|
||||
}
|
||||
};
|
||||
|
||||
The key insight is understanding **strides**, the number of elements to skip to move to the next element in each dimension. For non-overlapping tiles, we skip by ``tile_size`` in the outer dimensions.
|
||||
|
||||
Overlapping Windows for Convolution
|
||||
------------------------------------
|
||||
|
||||
For convolution, overlapping windows that slide by one element are needed:
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
// Extract 3x3 overlapping windows from a 6x6 image
|
||||
template<typename DataType>
|
||||
struct ConvolutionWindows {
|
||||
static constexpr index_t H = 6; // Image height
|
||||
static constexpr index_t W = 6; // Image width
|
||||
static constexpr index_t K = 3; // Kernel size
|
||||
static constexpr index_t OutH = H - K + 1; // Output height = 4
|
||||
static constexpr index_t OutW = W - K + 1; // Output width = 4
|
||||
|
||||
// Windows descriptor: shape=(4, 4, 3, 3), strides=(6, 1, 6, 1)
|
||||
using WindowDescriptor = TensorDescriptor<
|
||||
Sequence<OutH, OutW, K, K>,
|
||||
Sequence<W, 1, W, 1> // Key: stride by 1 for overlap!
|
||||
>;
|
||||
|
||||
__device__ DataType extract_window(const DataType* image,
|
||||
index_t out_i, index_t out_j,
|
||||
index_t k_i, index_t k_j) {
|
||||
WindowDescriptor desc;
|
||||
index_t offset = desc.calculate_offset({out_i, out_j, k_i, k_j});
|
||||
return image[offset];
|
||||
}
|
||||
};
|
||||
|
||||
The stride pattern ``[W, 1, W, 1]`` creates sliding windows:
|
||||
|
||||
- Moving one step in output row: jump ``W`` elements (one image row)
|
||||
- Moving one step in output col: jump ``1`` element (one image column)
|
||||
- Within each window: same strides to access the 3×3 patch
|
||||
|
||||
Naive Convolution Implementation
|
||||
================================
|
||||
|
||||
A straightforward implementation for reference:
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
template<typename DataType>
|
||||
__global__ void naive_convolution_kernel(
|
||||
const DataType* __restrict__ input,
|
||||
const DataType* __restrict__ kernel,
|
||||
DataType* __restrict__ output,
|
||||
index_t H, index_t W, index_t K)
|
||||
{
|
||||
index_t out_h = H - K + 1;
|
||||
index_t out_w = W - K + 1;
|
||||
|
||||
// Each thread computes one output element
|
||||
index_t out_i = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
index_t out_j = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
if (out_i < out_h && out_j < out_w) {
|
||||
DataType sum = 0;
|
||||
|
||||
// Extract window and apply convolution
|
||||
for (index_t ki = 0; ki < K; ++ki) {
|
||||
for (index_t kj = 0; kj < K; ++kj) {
|
||||
index_t in_i = out_i + ki;
|
||||
index_t in_j = out_j + kj;
|
||||
sum += input[in_i * W + in_j] * kernel[ki * K + kj];
|
||||
}
|
||||
}
|
||||
|
||||
output[out_i * out_w + out_j] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
This implementation directly follows the mathematical definition but has poor memory access patterns and limited parallelism within each output computation.
|
||||
|
||||
Window Extraction with Tensor Descriptors
|
||||
=========================================
|
||||
|
||||
CK Tile's tensor descriptors provide a clean way to extract convolution windows:
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
template<typename DataType, index_t H, index_t W, index_t K>
|
||||
struct ConvolutionWindowExtractor {
|
||||
static constexpr index_t OutH = H - K + 1;
|
||||
static constexpr index_t OutW = W - K + 1;
|
||||
|
||||
// Create tensor descriptor for all windows
|
||||
using WindowsDescriptor = TensorDescriptor<
|
||||
Sequence<OutH, OutW, K, K>,
|
||||
Sequence<W, 1, W, 1>
|
||||
>;
|
||||
|
||||
__device__ void extract_all_windows(
|
||||
const DataType* input,
|
||||
DataType* windows_buffer)
|
||||
{
|
||||
WindowsDescriptor desc;
|
||||
|
||||
// Extract all windows in parallel
|
||||
index_t tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
index_t total_elements = OutH * OutW * K * K;
|
||||
|
||||
for (index_t i = tid; i < total_elements; i += gridDim.x * blockDim.x) {
|
||||
// Convert linear index to 4D coordinates
|
||||
index_t tmp = i;
|
||||
index_t kj = tmp % K; tmp /= K;
|
||||
index_t ki = tmp % K; tmp /= K;
|
||||
index_t out_j = tmp % OutW; tmp /= OutW;
|
||||
index_t out_i = tmp;
|
||||
|
||||
// Calculate source offset using descriptor
|
||||
index_t src_offset = desc.calculate_offset({out_i, out_j, ki, kj});
|
||||
windows_buffer[i] = input[src_offset];
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
The tensor descriptor automatically handles the complex indexing required for overlapping windows, making the code cleaner and less error-prone.
|
||||
|
||||
Im2col Transformation
|
||||
=====================
|
||||
|
||||
The im2col transformation converts the 4D windows tensor into a 2D matrix suitable for matrix multiplication. This is where CK Tile's :ref:`transformation system <ck_tile_transforms>` shines:
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
template<typename DataType, index_t OutH, index_t OutW, index_t K>
|
||||
struct Im2colTransformer {
|
||||
static constexpr index_t NumWindows = OutH * OutW;
|
||||
static constexpr index_t PatchSize = K * K;
|
||||
|
||||
// Step 1: Create 4D windows descriptor
|
||||
using WindowsDescriptor = TensorDescriptor<
|
||||
Sequence<OutH, OutW, K, K>,
|
||||
Sequence<W, 1, W, 1>
|
||||
>;
|
||||
|
||||
// Step 2: Apply merge transforms to create 2D im2col layout
|
||||
// See :ref:`ck_tile_transforms` for transform operations
|
||||
using Im2colDescriptor = decltype(
|
||||
transform_tensor_descriptor(
|
||||
WindowsDescriptor{},
|
||||
make_tuple(
|
||||
make_merge_transform(Sequence<OutH, OutW>{}), // Merge spatial dims
|
||||
make_merge_transform(Sequence<K, K>{}) // Merge kernel dims
|
||||
),
|
||||
Sequence<0, 1>{}, // Merge dimensions 0,1
|
||||
Sequence<2, 3>{} // Merge dimensions 2,3
|
||||
)
|
||||
);
|
||||
|
||||
__device__ void create_im2col_matrix(
|
||||
const DataType* input,
|
||||
DataType* im2col_matrix)
|
||||
{
|
||||
Im2colDescriptor desc;
|
||||
|
||||
// Each thread handles multiple elements
|
||||
index_t tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
index_t total_elements = NumWindows * PatchSize;
|
||||
|
||||
for (index_t i = tid; i < total_elements; i += gridDim.x * blockDim.x) {
|
||||
index_t window_idx = i / PatchSize;
|
||||
index_t patch_idx = i % PatchSize;
|
||||
|
||||
// Calculate source offset using merged descriptor
|
||||
index_t src_offset = desc.calculate_offset({window_idx, patch_idx});
|
||||
im2col_matrix[i] = input[src_offset];
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
The transformation pipeline:
|
||||
1. Start with 4D tensor ``[OutH, OutW, K, K]``
|
||||
2. Merge spatial dimensions: ``[OutH, OutW] → NumWindows``
|
||||
3. Merge kernel dimensions: ``[K, K] → PatchSize``
|
||||
4. Result: 2D matrix ``[NumWindows, PatchSize]``
|
||||
|
||||
Optimized Convolution Kernel
|
||||
============================
|
||||
|
||||
Combining all components into an optimized convolution implementation:
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
template<typename DataType,
|
||||
index_t TileM, index_t TileN, index_t TileK,
|
||||
index_t BlockM, index_t BlockN>
|
||||
__global__ void optimized_convolution_kernel(
|
||||
const DataType* __restrict__ input,
|
||||
const DataType* __restrict__ kernel,
|
||||
DataType* __restrict__ output,
|
||||
index_t H, index_t W, index_t K)
|
||||
{
|
||||
constexpr index_t WarpSize = 32;
|
||||
const index_t OutH = H - K + 1;
|
||||
const index_t OutW = W - K + 1;
|
||||
const index_t NumWindows = OutH * OutW;
|
||||
const index_t PatchSize = K * K;
|
||||
|
||||
// Create im2col descriptor for this image size
|
||||
using Im2colDesc = TensorDescriptor<
|
||||
Sequence<NumWindows, PatchSize>,
|
||||
DynamicStrides // Computed based on H, W, K
|
||||
>;
|
||||
|
||||
// Tile distribution for matrix multiplication
|
||||
// See :ref:`ck_tile_tile_distribution` for details
|
||||
using ATileDist = TileDistribution<
|
||||
Sequence<TileM, TileK>,
|
||||
Sequence<BlockM, 1>
|
||||
>;
|
||||
using BTileDist = TileDistribution<
|
||||
Sequence<TileK, TileN>,
|
||||
Sequence<1, BlockN>
|
||||
>;
|
||||
using CTileDist = TileDistribution<
|
||||
Sequence<TileM, TileN>,
|
||||
Sequence<BlockM, BlockN>
|
||||
>;
|
||||
|
||||
// Thread-local accumulator
|
||||
// See :ref:`ck_tile_static_distributed_tensor`
|
||||
StaticDistributedTensor<DataType, CTileDist> c_accumulator;
|
||||
|
||||
// Initialize accumulator
|
||||
#pragma unroll
|
||||
for (index_t i = 0; i < c_accumulator.size(); ++i) {
|
||||
c_accumulator[i] = 0;
|
||||
}
|
||||
|
||||
// Main GEMM loop over K dimension
|
||||
for (index_t k_tile = 0; k_tile < PatchSize; k_tile += TileK) {
|
||||
// Create tile windows for im2col matrix and kernel
|
||||
// See :ref:`ck_tile_tile_window` for window operations
|
||||
auto a_window = make_tile_window<ATileDist>(
|
||||
input, Im2colDesc{H, W, K},
|
||||
{blockIdx.y * TileM, k_tile}
|
||||
);
|
||||
|
||||
auto b_window = make_tile_window<BTileDist>(
|
||||
kernel, TensorDescriptor<Sequence<PatchSize, 1>>{},
|
||||
{k_tile, 0}
|
||||
);
|
||||
|
||||
// Load tiles - see :ref:`ck_tile_load_store_traits` for optimization
|
||||
auto a_tile = a_window.load();
|
||||
auto b_tile = b_window.load();
|
||||
|
||||
// Synchronize after loads
|
||||
__syncthreads();
|
||||
|
||||
// Local matrix multiplication
|
||||
#pragma unroll
|
||||
for (index_t m = 0; m < TileM/BlockM; ++m) {
|
||||
#pragma unroll
|
||||
for (index_t n = 0; n < TileN/BlockN; ++n) {
|
||||
#pragma unroll
|
||||
for (index_t k = 0; k < TileK; ++k) {
|
||||
c_accumulator.at(m, n) +=
|
||||
a_tile.at(m, k) * b_tile.at(k, n);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Store results back to global memory
|
||||
auto c_window = make_tile_window<CTileDist>(
|
||||
output, TensorDescriptor<Sequence<OutH, OutW>>{OutW, 1},
|
||||
{blockIdx.y * TileM, blockIdx.x * TileN}
|
||||
);
|
||||
c_window.store(c_accumulator);
|
||||
}
|
||||
|
||||
Multi-Channel Convolution
|
||||
=========================
|
||||
|
||||
Real-world convolutions involve multiple input and output channels. CK Tile handles this cleanly:
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
template<typename DataType,
|
||||
index_t H, index_t W,
|
||||
index_t CIn, index_t COut,
|
||||
index_t K>
|
||||
struct MultiChannelConvolution {
|
||||
static constexpr index_t OutH = H - K + 1;
|
||||
static constexpr index_t OutW = W - K + 1;
|
||||
static constexpr index_t NumWindows = OutH * OutW;
|
||||
static constexpr index_t PatchSize = K * K * CIn;
|
||||
|
||||
// 5D windows descriptor [OutH, OutW, K, K, CIn]
|
||||
using Windows5D = TensorDescriptor<
|
||||
Sequence<OutH, OutW, K, K, CIn>,
|
||||
Sequence<W*CIn, CIn, W*CIn, CIn, 1>
|
||||
>;
|
||||
|
||||
// Im2col: [NumWindows, PatchSize]
|
||||
using Im2colDesc = decltype(
|
||||
transform_tensor_descriptor(
|
||||
Windows5D{},
|
||||
make_tuple(
|
||||
make_merge_transform(Sequence<OutH, OutW>{}),
|
||||
make_merge_transform(Sequence<K, K, CIn>{})
|
||||
),
|
||||
Sequence<0, 1>{},
|
||||
Sequence<2, 3, 4>{}
|
||||
)
|
||||
);
|
||||
|
||||
// Filter layout: [K*K*CIn, COut]
|
||||
using FilterDesc = TensorDescriptor<
|
||||
Sequence<PatchSize, COut>,
|
||||
Sequence<COut, 1>
|
||||
>;
|
||||
|
||||
__device__ void compute(
|
||||
const DataType* input, // [H, W, CIn]
|
||||
const DataType* filters, // [K, K, CIn, COut]
|
||||
DataType* output) // [OutH, OutW, COut]
|
||||
{
|
||||
// The convolution becomes a matrix multiplication:
|
||||
// [NumWindows, PatchSize] @ [PatchSize, COut] = [NumWindows, COut]
|
||||
// Then reshape to [OutH, OutW, COut]
|
||||
}
|
||||
};
|
||||
|
||||
The multi-channel extension naturally follows from the single-channel case:
|
||||
|
||||
- Input: ``[H, W, CIn]``
|
||||
- Filters: ``[K, K, CIn, COut]``
|
||||
- Im2col matrix: ``[NumWindows, K×K×CIn]``
|
||||
- Output: ``[OutH, OutW, COut]``
|
||||
|
||||
Performance Optimizations
|
||||
=========================
|
||||
|
||||
CK Tile enables several optimizations for convolution:
|
||||
|
||||
**1. Memory Coalescing**
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
// Coalesced access pattern for im2col
|
||||
template<index_t VectorSize>
|
||||
__device__ void load_im2col_vectorized(
|
||||
const float* input,
|
||||
float* im2col_tile,
|
||||
const Im2colDescriptor& desc)
|
||||
{
|
||||
using VectorType = vector_type_t<float, VectorSize>;
|
||||
|
||||
// Load multiple elements per thread
|
||||
index_t tid = threadIdx.x;
|
||||
index_t stride = blockDim.x;
|
||||
|
||||
for (index_t i = tid; i < NumElements; i += stride * VectorSize) {
|
||||
VectorType vec = *reinterpret_cast<const VectorType*>(&input[i]);
|
||||
*reinterpret_cast<VectorType*>(&im2col_tile[i]) = vec;
|
||||
}
|
||||
}
|
||||
|
||||
**2. Shared Memory Tiling**
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
// Use shared memory for frequently accessed data
|
||||
__shared__ float smem_a[TileM][TileK];
|
||||
__shared__ float smem_b[TileK][TileN];
|
||||
|
||||
// Collaborative loading with proper bank conflict avoidance
|
||||
// See :ref:`ck_tile_lds_bank_conflicts` for optimization
|
||||
auto load_tile_to_smem = [&](auto& window, float smem[][TileK]) {
|
||||
#pragma unroll
|
||||
for (index_t i = threadIdx.y; i < TileM; i += blockDim.y) {
|
||||
#pragma unroll
|
||||
for (index_t j = threadIdx.x; j < TileK; j += blockDim.x) {
|
||||
smem[i][j] = window.at(i, j);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
**3. Register Blocking**
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
// Each thread computes multiple output elements
|
||||
template<index_t RegM, index_t RegN>
|
||||
struct RegisterBlock {
|
||||
float c_reg[RegM][RegN];
|
||||
|
||||
__device__ void compute(const float* a_smem, const float* b_smem) {
|
||||
#pragma unroll
|
||||
for (index_t k = 0; k < TileK; ++k) {
|
||||
#pragma unroll
|
||||
for (index_t m = 0; m < RegM; ++m) {
|
||||
#pragma unroll
|
||||
for (index_t n = 0; n < RegN; ++n) {
|
||||
c_reg[m][n] += a_smem[m] * b_smem[n];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
Performance Characteristics
|
||||
===========================
|
||||
|
||||
The tensor descriptor approach provides optimal performance characteristics:
|
||||
|
||||
.. list-table:: Method Comparison
|
||||
:header-rows: 1
|
||||
:widths: 25 20 20 20 15
|
||||
|
||||
* - Method
|
||||
- Memory Usage
|
||||
- Parallelization
|
||||
- GPU Efficiency
|
||||
- Flexibility
|
||||
* - Naive loops
|
||||
- Low
|
||||
- Poor
|
||||
- Poor
|
||||
- High
|
||||
* - Direct im2col copy
|
||||
- High
|
||||
- Excellent
|
||||
- Good
|
||||
- Medium
|
||||
* - Tensor descriptors
|
||||
- Medium
|
||||
- Excellent
|
||||
- Excellent
|
||||
- High
|
||||
* - CK Tile optimized
|
||||
- Low
|
||||
- Excellent
|
||||
- Excellent
|
||||
- High
|
||||
|
||||
Key advantages of the CK Tile approach:
|
||||
|
||||
1. **Zero-copy views**: Tensor descriptors create logical views without data duplication
|
||||
2. **Compile-time optimization**: All indexing calculations resolve at compile time
|
||||
3. **Hardware-aware**: Automatic alignment and vectorization based on :ref:`architecture <ck_tile_gpu_basics>`
|
||||
4. **Composability**: Complex access patterns built from simple :ref:`transformations <ck_tile_transforms>`
|
||||
5. **Performance portability**: Same code optimizes differently for different GPUs
|
||||
|
||||
Summary
|
||||
=======
|
||||
|
||||
This example demonstrates how CK Tile transforms convolution from a memory-bound operation with poor parallelism into a compute-bound operation that utilizes GPU resources. The key insights are:
|
||||
|
||||
- **Sliding windows** can be efficiently represented using tensor descriptors with appropriate strides
|
||||
- **Im2col transformation** converts convolution to matrix multiplication without data copies
|
||||
- **Tile distribution** enables optimal work distribution across GPU threads (see :ref:`ck_tile_tile_distribution`)
|
||||
- **Multi-channel support** extends naturally through higher-dimensional descriptors
|
||||
- **Performance optimizations** like vectorization and shared memory are seamlessly integrated (see :ref:`ck_tile_gemm_optimization` for similar techniques)
|
||||
|
||||
The tensor descriptor system provides a unified framework for these transformations, enabling automatic generation of efficient kernels for various convolution configurations and hardware architectures. This approach forms the foundation for production deep learning frameworks' convolution implementations.
|
||||
532
docs/conceptual/ck_tile/coordinate_movement.rst
Normal file
@@ -0,0 +1,532 @@
|
||||
.. meta::
|
||||
:description: CK Tile advanced coordinate operations documentation
|
||||
:keywords: CK Tile, coordinate movement, tensor coordinates, GPU programming
|
||||
|
||||
.. _ck_tile_coordinate_movement:
|
||||
|
||||
****************************
|
||||
Advanced Coordinate Movement
|
||||
****************************
|
||||
|
||||
Overview
|
||||
========
|
||||
|
||||
Advanced coordinate operations form the bridge between mathematical transformations and practical tensor manipulation in CK Tile. These operations enable efficient navigation through complex tensor layouts without recalculating entire transformation chains. Understanding coordinate movement is essential for implementing high-performance GPU kernels that traverse multi-dimensional data structures.
|
||||
|
||||
The coordinate movement system provides two key abstractions: TensorCoordinate for descriptor-aware navigation and TensorAdaptorCoordinate for tracking positions through transformation chains. Together with movement functions, they enable advanced access patterns while maintaining optimal performance through incremental updates rather than full recalculation.
|
||||
|
||||
For the mathematical foundations of coordinate systems, see :ref:`ck_tile_coordinate_systems`. For simpler coordinate concepts, see :ref:`ck_tile_tensor_coordinates`.
|
||||
|
||||
..
   Original mermaid diagram (edit here, then run update_diagrams.py)
||||
.. mermaid::
|
||||
|
||||
graph TB
|
||||
subgraph "Coordinate Movement System"
|
||||
TC["TensorCoordinate<br/>Position + Descriptor Context"]
|
||||
TAC["TensorAdaptorCoordinate<br/>Position + Transform Context"]
|
||||
MC["move_coordinate()<br/>Efficient Navigation"]
|
||||
end
|
||||
|
||||
subgraph "Movement Example"
|
||||
S["Start: [1,1]<br/>Offset: 5"]
|
||||
M1["Move [0,1]<br/>→ [1,2]<br/>Offset: 6"]
|
||||
M2["Move [1,0]<br/>→ [2,2]<br/>Offset: 10"]
|
||||
M3["Move [1,1]<br/>→ [3,3]<br/>Offset: 15"]
|
||||
end
|
||||
|
||||
TC --> MC
|
||||
TAC --> MC
|
||||
|
||||
S --> M1
|
||||
M1 --> M2
|
||||
M2 --> M3
|
||||
|
||||
style TC fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
|
||||
style TAC fill:#fff3e0,stroke:#f57c00,stroke-width:2px
|
||||
style MC fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
.. image:: diagrams/coordinate_movement.svg
   :alt: Diagram
   :align: center
|
||||
|
||||
TensorCoordinate: Descriptor-Aware Navigation
|
||||
=============================================
|
||||
|
||||
TensorCoordinate combines a multi-dimensional position with descriptor context to provide efficient offset calculation and validation. It caches transformation results to avoid redundant computations during navigation. This builds on the :ref:`ck_tile_descriptors` concepts for tensor specifications.
|
||||
|
||||
Basic Structure
|
||||
---------------
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
template<typename TensorDescriptor>
|
||||
class TensorCoordinate {
|
||||
private:
|
||||
MultiIndex top_index_; // Position in top dimensions
|
||||
MultiIndex hidden_index_; // Cached transformation results
|
||||
index_t offset_; // Cached linear offset
|
||||
|
||||
public:
|
||||
// Create coordinate from descriptor and position
|
||||
__host__ __device__ TensorCoordinate(
|
||||
const TensorDescriptor& desc,
|
||||
const MultiIndex& top_index)
|
||||
{
|
||||
top_index_ = top_index;
|
||||
// Apply descriptor transforms to compute hidden indices
|
||||
hidden_index_ = desc.calculate_bottom_index(top_index);
|
||||
offset_ = desc.calculate_offset(top_index);
|
||||
}
|
||||
|
||||
// Access methods
|
||||
__host__ __device__ const MultiIndex& get_index() const {
|
||||
return top_index_;
|
||||
}
|
||||
|
||||
__host__ __device__ index_t get_offset() const {
|
||||
return offset_;
|
||||
}
|
||||
|
||||
__host__ __device__ index_t ndim_hidden() const {
|
||||
return hidden_index_.size();
|
||||
}
|
||||
};
|
||||
|
||||
Creating and Using TensorCoordinate
|
||||
-----------------------------------
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
// Example: Navigate a 4x3 matrix with custom strides
|
||||
template<typename DataType>
|
||||
__device__ void demonstrate_tensor_coordinate() {
|
||||
// Create descriptor for 4x3 matrix, row-major layout
|
||||
using Desc = TensorDescriptor<
|
||||
Sequence<4, 3>, // Shape
|
||||
Sequence<3, 1> // Strides
|
||||
>;
|
||||
Desc desc;
|
||||
|
||||
// Create coordinate at position [2, 1]
|
||||
auto coord = make_tensor_coordinate(desc, make_multi_index(2, 1));
|
||||
|
||||
// Access coordinate information
|
||||
auto position = coord.get_index(); // [2, 1]
|
||||
auto offset = coord.get_offset(); // 2*3 + 1 = 7
|
||||
auto hidden_dims = coord.ndim_hidden(); // 0 (no hidden dims)
|
||||
|
||||
// Use offset for memory access
|
||||
DataType* tensor_data = ...;
|
||||
DataType value = tensor_data[offset];
|
||||
}
|
||||
|
||||
Key Benefits
|
||||
------------
|
||||
|
||||
1. **Context Preservation**: The coordinate maintains descriptor context for validation
|
||||
2. **Cached Calculations**: Transformation results are cached for efficiency
|
||||
3. **Type Safety**: Compile-time checking ensures coordinate-descriptor compatibility
|
||||
4. **Zero Overhead**: All operations resolve at compile time when possible
|
||||
|
||||
|
||||
TensorAdaptorCoordinate: Transform-Aware Tracking
|
||||
==================================================
|
||||
|
||||
TensorAdaptorCoordinate extends the concept to track coordinates through transformation chains, maintaining both input (top) and output (bottom) positions. This leverages :ref:`ck_tile_adaptors` and :ref:`ck_tile_transforms` for complex coordinate mappings.
|
||||
|
||||
Structure and Implementation
|
||||
----------------------------
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
template<typename TensorAdaptor>
|
||||
class TensorAdaptorCoordinate {
|
||||
private:
|
||||
MultiIndex top_index_; // Input position
|
||||
MultiIndex bottom_index_; // Output after transformations
|
||||
MultiIndex hidden_index_; // Intermediate results
|
||||
|
||||
public:
|
||||
// Create from adaptor and position
|
||||
__host__ __device__ TensorAdaptorCoordinate(
|
||||
const TensorAdaptor& adaptor,
|
||||
const MultiIndex& top_index)
|
||||
{
|
||||
top_index_ = top_index;
|
||||
// Apply adaptor transforms
|
||||
bottom_index_ = adaptor.calculate_bottom_index(top_index);
|
||||
// Cache intermediate results
|
||||
hidden_index_ = adaptor.get_hidden_index(top_index);
|
||||
}
|
||||
|
||||
// Access transformed coordinates
|
||||
__host__ __device__ const MultiIndex& get_top_index() const {
|
||||
return top_index_;
|
||||
}
|
||||
|
||||
__host__ __device__ const MultiIndex& get_bottom_index() const {
|
||||
return bottom_index_;
|
||||
}
|
||||
};
|
||||
|
||||
Tracking Through Transformations
|
||||
--------------------------------
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
// Example: Track coordinates through transpose
|
||||
template<typename DataType>
|
||||
__device__ void demonstrate_adaptor_coordinate() {
|
||||
// Create transpose adaptor (swap dimensions)
|
||||
auto adaptor = make_transpose_adaptor<2>(Sequence<1, 0>{});
|
||||
|
||||
// Create coordinate at [2, 3]
|
||||
auto coord = make_tensor_adaptor_coordinate(
|
||||
adaptor,
|
||||
make_multi_index(2, 3)
|
||||
);
|
||||
|
||||
// Track transformation
|
||||
auto input_pos = coord.get_top_index(); // [2, 3]
|
||||
auto output_pos = coord.get_bottom_index(); // [3, 2] (swapped)
|
||||
|
||||
// Use for complex access patterns
|
||||
DataType* src_data = ...;
|
||||
DataType* dst_data = ...;
|
||||
|
||||
// Read from transposed position
|
||||
index_t src_offset = calculate_offset(output_pos);
|
||||
DataType value = src_data[src_offset];
|
||||
}
|
||||
|
||||
Efficient Coordinate Movement
|
||||
=============================
|
||||
|
||||
The ``move_tensor_coordinate`` function provides efficient navigation by updating coordinates incrementally rather than recreating them.
|
||||
|
||||
Basic Movement Operations
|
||||
-------------------------
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
// Move tensor coordinate through descriptor
|
||||
template<typename TensorDescriptor>
|
||||
__host__ __device__ void move_tensor_coordinate(
|
||||
const TensorDescriptor& desc,
|
||||
TensorCoordinate<TensorDescriptor>& coord,
|
||||
const MultiIndex& step)
|
||||
{
|
||||
// Update top index
|
||||
coord.top_index_ += step;
|
||||
|
||||
// Incrementally update cached values
|
||||
// Only recalculate affected transformations
|
||||
if (transformation_affects_movement(desc, step)) {
|
||||
coord.hidden_index_ = desc.calculate_bottom_index(coord.top_index_);
|
||||
coord.offset_ = desc.calculate_offset(coord.top_index_);
|
||||
} else {
|
||||
// Fast path: simple offset update
|
||||
coord.offset_ += calculate_step_offset(desc, step);
|
||||
}
|
||||
}
|
||||
|
||||
Practical Movement Patterns
|
||||
---------------------------
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
// Example: Efficient matrix traversal
|
||||
template<typename DataType>
|
||||
__global__ void matrix_traversal_kernel(
|
||||
const DataType* input,
|
||||
DataType* output,
|
||||
index_t rows, index_t cols)
|
||||
{
|
||||
// Create descriptor for matrix
|
||||
using Desc = TensorDescriptor<DynamicSequence, DynamicSequence>;
|
||||
Desc desc(make_tuple(rows, cols), make_tuple(cols, 1));
|
||||
|
||||
// Start at thread's assigned position
|
||||
index_t start_row = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
index_t start_col = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
auto coord = make_tensor_coordinate(
|
||||
desc,
|
||||
make_multi_index(start_row, start_col)
|
||||
);
|
||||
|
||||
// Row-wise traversal pattern
|
||||
for (index_t i = 0; i < 4; ++i) {
|
||||
if (coord.get_index()[0] < rows) {
|
||||
// Process current position
|
||||
output[coord.get_offset()] =
|
||||
process_value(input[coord.get_offset()]);
|
||||
|
||||
// Move to next column
|
||||
move_tensor_coordinate(desc, coord, make_multi_index(0, 1));
|
||||
|
||||
// Wrap to next row if needed
|
||||
if (coord.get_index()[1] >= cols) {
|
||||
move_tensor_coordinate(
|
||||
desc, coord,
|
||||
make_multi_index(1, -cols)
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Movement Through Adaptors
|
||||
-------------------------
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
// Move through adaptor transformations
|
||||
template<typename TensorAdaptor>
|
||||
__host__ __device__ MultiIndex move_tensor_adaptor_coordinate(
|
||||
const TensorAdaptor& adaptor,
|
||||
TensorAdaptorCoordinate<TensorAdaptor>& coord,
|
||||
const MultiIndex& step)
|
||||
{
|
||||
// Update top index
|
||||
MultiIndex old_top = coord.top_index_;
|
||||
coord.top_index_ += step;
|
||||
|
||||
// Calculate new bottom index
|
||||
MultiIndex old_bottom = coord.bottom_index_;
|
||||
coord.bottom_index_ = adaptor.calculate_bottom_index(coord.top_index_);
|
||||
|
||||
// Return the change in bottom coordinates
|
||||
return coord.bottom_index_ - old_bottom;
|
||||
}
|
||||
|
||||
Advanced Movement Patterns
|
||||
==========================
|
||||
|
||||
Real-world applications use advanced movement patterns for optimal memory access. These patterns often relate to :ref:`ck_tile_tile_window` operations and :ref:`ck_tile_tile_distribution` concepts:
|
||||
|
||||
Tiled Access Pattern
|
||||
--------------------
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
template<index_t TileM, index_t TileN>
|
||||
__device__ void tiled_movement_pattern(
|
||||
const float* input,
|
||||
float* output,
|
||||
index_t M, index_t N)
|
||||
{
|
||||
// Descriptor for full matrix
|
||||
using MatrixDesc = TensorDescriptor<
|
||||
DynamicSequence,
|
||||
DynamicSequence
|
||||
>;
|
||||
MatrixDesc desc(make_tuple(M, N), make_tuple(N, 1));
|
||||
|
||||
// Start at tile corner
|
||||
index_t tile_row = blockIdx.y * TileM;
|
||||
index_t tile_col = blockIdx.x * TileN;
|
||||
|
||||
auto coord = make_tensor_coordinate(
|
||||
desc,
|
||||
make_multi_index(tile_row, tile_col)
|
||||
);
|
||||
|
||||
// Process tile with efficient movement
|
||||
#pragma unroll
|
||||
for (index_t i = 0; i < TileM; ++i) {
|
||||
#pragma unroll
|
||||
for (index_t j = 0; j < TileN; ++j) {
|
||||
if (i == 0 && j == 0) {
|
||||
// First element - already positioned
|
||||
} else if (j == 0) {
|
||||
// New row - move down and back to start column
|
||||
move_tensor_coordinate(
|
||||
desc, coord,
|
||||
make_multi_index(1, -(TileN-1))
|
||||
);
|
||||
} else {
|
||||
// Same row - move right
|
||||
move_tensor_coordinate(
|
||||
desc, coord,
|
||||
make_multi_index(0, 1)
|
||||
);
|
||||
}
|
||||
|
||||
// Process element
|
||||
output[coord.get_offset()] =
|
||||
compute_value(input[coord.get_offset()]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Space-Filling Curve Movement
|
||||
----------------------------
|
||||
|
||||
For more details on space-filling curves and their benefits, see :ref:`ck_tile_space_filling_curve`.
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
// Snake pattern for optimal cache usage
|
||||
template<index_t BlockSize>
|
||||
__device__ void snake_pattern_movement(
|
||||
const float* input,
|
||||
float* output,
|
||||
index_t M, index_t N)
|
||||
{
|
||||
using Desc = TensorDescriptor<DynamicSequence, DynamicSequence>;
|
||||
Desc desc(make_tuple(M, N), make_tuple(N, 1));
|
||||
|
||||
auto coord = make_tensor_coordinate(
|
||||
desc,
|
||||
make_multi_index(threadIdx.y, threadIdx.x)
|
||||
);
|
||||
|
||||
// Snake through block
|
||||
for (index_t row = 0; row < BlockSize; ++row) {
|
||||
for (index_t col = 0; col < BlockSize; ++col) {
|
||||
// Process current position
|
||||
process_element(input, output, coord.get_offset());
|
||||
|
||||
// Snake movement pattern
|
||||
if (row % 2 == 0) {
|
||||
// Even rows: move right
|
||||
if (col < BlockSize - 1) {
|
||||
move_tensor_coordinate(
|
||||
desc, coord, make_multi_index(0, 1)
|
||||
);
|
||||
}
|
||||
} else {
|
||||
// Odd rows: move left
|
||||
if (col < BlockSize - 1) {
|
||||
move_tensor_coordinate(
|
||||
desc, coord, make_multi_index(0, -1)
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Move to next row
|
||||
if (row < BlockSize - 1) {
|
||||
move_tensor_coordinate(
|
||||
desc, coord, make_multi_index(1, 0)
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Performance Considerations
|
||||
==========================
|
||||
|
||||
Efficient coordinate movement is critical for GPU performance. See :ref:`ck_tile_gpu_basics` for hardware details.
|
||||
|
||||
**1. Incremental Updates**
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
// Inefficient: recreate coordinate
|
||||
for (index_t i = 0; i < N; ++i) {
|
||||
auto coord = make_tensor_coordinate(desc, make_multi_index(i, j));
|
||||
process(data[coord.get_offset()]);
|
||||
}
|
||||
|
||||
// Efficient: incremental movement
|
||||
auto coord = make_tensor_coordinate(desc, make_multi_index(0, j));
|
||||
for (index_t i = 0; i < N; ++i) {
|
||||
process(data[coord.get_offset()]);
|
||||
move_tensor_coordinate(desc, coord, make_multi_index(1, 0));
|
||||
}
|
||||
|
||||
**2. Movement Caching**
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
// Cache frequently used movements
|
||||
template<typename Desc>
|
||||
struct MovementCache {
|
||||
MultiIndex row_step = make_multi_index(1, 0);
|
||||
MultiIndex col_step = make_multi_index(0, 1);
|
||||
MultiIndex diag_step = make_multi_index(1, 1);
|
||||
|
||||
__device__ void move_row(auto& coord) {
|
||||
move_tensor_coordinate(Desc{}, coord, row_step);
|
||||
}
|
||||
};
|
||||
|
||||
**3. Vectorized Movement**
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
// Move multiple coordinates simultaneously
|
||||
template<index_t NumCoords>
|
||||
__device__ void vectorized_movement(
|
||||
TensorCoordinate<Desc> coords[NumCoords],
|
||||
const MultiIndex& step)
|
||||
{
|
||||
#pragma unroll
|
||||
for (index_t i = 0; i < NumCoords; ++i) {
|
||||
move_tensor_coordinate(Desc{}, coords[i], step);
|
||||
}
|
||||
}
|
||||
|
||||
Integration with CK Tile Components
|
||||
===================================
|
||||
|
||||
Coordinate movement integrates seamlessly with other CK Tile components:
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
// Example: Tile window with coordinate movement
|
||||
template<typename TileWindow>
|
||||
__device__ void process_tile_with_movement(
|
||||
TileWindow& window,
|
||||
index_t tile_size)
|
||||
{
|
||||
// Create coordinate for tile traversal
|
||||
auto coord = window.get_tile_coordinate();
|
||||
|
||||
// Process tile elements with movement
|
||||
for (index_t i = 0; i < tile_size; ++i) {
|
||||
for (index_t j = 0; j < tile_size; ++j) {
|
||||
// Load using coordinate
|
||||
auto value = window.load_at(coord);
|
||||
|
||||
// Process value
|
||||
auto result = compute(value);
|
||||
|
||||
// Store result
|
||||
window.store_at(coord, result);
|
||||
|
||||
// Move to next element
|
||||
window.move_coordinate(coord, {0, 1});
|
||||
}
|
||||
// Move to next row
|
||||
window.move_coordinate(coord, {1, -tile_size});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Advanced coordinate operations provide the foundation for efficient tensor navigation in CK Tile:
|
||||
|
||||
- **TensorCoordinate**: Combines position with descriptor context for validated navigation
|
||||
- **TensorAdaptorCoordinate**: Tracks coordinates through transformation chains
|
||||
- **move_tensor_coordinate**: Enables efficient incremental updates without recalculation
|
||||
- **Movement Patterns**: Support advanced access patterns like tiling and space-filling curves
|
||||
- **Performance**: Incremental updates are orders of magnitude faster than coordinate recreation
|
||||
- **Integration**: Seamlessly works with tile windows, distributions, and other CK Tile components
|
||||
|
||||
These operations are essential for implementing high-performance GPU kernels that can navigate complex tensor layouts efficiently. By understanding and utilizing coordinate movement, developers can create kernels that achieve optimal memory access patterns while maintaining code clarity and correctness.
|
||||
612
docs/conceptual/ck_tile/coordinate_systems.rst
Normal file
@@ -0,0 +1,612 @@
|
||||
.. _ck_tile_coordinate_systems:
|
||||
|
||||
Coordinate Systems - The Mathematical Foundation
|
||||
================================================
|
||||
|
||||
Overview
|
||||
--------
|
||||
|
||||
At the heart of the Composable Kernel framework lies a mathematical foundation based on coordinate transformations. This foundation enables the automatic generation of optimal memory access patterns while maintaining a clear separation between algorithmic intent and hardware implementation details. The coordinate system framework transforms the task of GPU work distribution into a series of well-defined mathematical transformations.
|
||||
|
||||
These coordinate systems provide the mathematical machinery that maps abstract thread identities to concrete memory addresses, ensuring that every memory access is optimized for the underlying hardware. This systematic approach eliminates the error-prone manual calculations that plague traditional GPU programming while enabling optimizations that would be impractical to implement by hand.
|
||||
|
||||
The Five Coordinate Spaces
|
||||
--------------------------
|
||||
|
||||
The CK framework employs five interconnected coordinate spaces, each serving a specific purpose in the journey from thread identification to memory access. These spaces work together to solve the fundamental challenge of GPU programming: efficiently distributing work across thousands of parallel threads while maintaining optimal memory access patterns.
|
||||
|
||||
..
|
||||
Original mermaid diagram (edit here, then run update_diagrams.py)
|
||||
|
||||
..
|
||||
Original mermaid diagram (edit here, then run update_diagrams.py)
|
||||
|
||||
.. mermaid::
|
||||
|
||||
graph TB
|
||||
subgraph "Coordinate Spaces Overview"
|
||||
P["P-space<br/>Thread Identification<br/>Which thread am I?"]
|
||||
Y["Y-space<br/>Logical Tile<br/>Which element in my tile?"]
|
||||
X["X-space<br/>Physical Tensor<br/>Where in the tensor?"]
|
||||
R["R-space<br/>Replication<br/>Data sharing pattern"]
|
||||
D["D-space<br/>Linear Storage<br/>Memory address"]
|
||||
end
|
||||
|
||||
subgraph "Transformations"
|
||||
T1["P + Y → X<br/>Thread + Element → Position"]
|
||||
T2["X → D<br/>Position → Address"]
|
||||
end
|
||||
|
||||
P --> T1
|
||||
Y --> T1
|
||||
T1 --> X
|
||||
X --> T2
|
||||
T2 --> D
|
||||
|
||||
R -.-> P
|
||||
R -.-> Y
|
||||
|
||||
style P fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
|
||||
style Y fill:#fff3e0,stroke:#f57c00,stroke-width:2px
|
||||
style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
|
||||
style R fill:#fce4ec,stroke:#c2185b,stroke-width:2px
|
||||
style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
.. image:: diagrams/coordinate_systems_1.svg
|
||||
:alt: Diagram
|
||||
:align: center
|
||||
|
||||
The Challenge and Solution
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Consider a fundamental scenario: an 8×8 matrix and 4 GPU threads. Each thread needs to answer several critical questions:
|
||||
|
||||
1. **Which thread am I?** (Thread identification)
|
||||
2. **What work should I do?** (Work assignment)
|
||||
3. **Where is my data in the tensor?** (Physical location)
|
||||
4. **How do I share data with other threads?** (Cooperation)
|
||||
5. **What's the memory address?** (Hardware access)
|
||||
|
||||
The coordinate system framework provides a systematic solution through five specialized spaces that transform from logical concepts to physical reality. Each space captures a different aspect of the computation, and the transformations between them encode the distribution strategy.
|
||||
|
||||
Thread Identification
|
||||
------------------------------
|
||||
|
||||
Partition Space (P-space) represents the foundation of the coordinate system hierarchy. This space captures the identity of each processing element within the GPU's execution model, providing a structured way to identify threads across the complex hierarchy of warps, blocks, and grids.
|
||||
|
||||
GPU Thread Hierarchy
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
..
|
||||
Original mermaid diagram (edit here, then run update_diagrams.py)
|
||||
|
||||
..
|
||||
Original mermaid diagram (edit here, then run update_diagrams.py)
|
||||
|
||||
.. mermaid::
|
||||
|
||||
graph TB
|
||||
subgraph "GPU Thread Hierarchy"
|
||||
subgraph "Block"
|
||||
subgraph "Warp 0"
|
||||
T0["Thread 0<br/>P=[0,0]"]
|
||||
T1["Thread 1<br/>P=[0,1]"]
|
||||
T2["Thread 2<br/>P=[0,2]"]
|
||||
T31["..."]
|
||||
T3["Thread 31<br/>P=[0,31]"]
|
||||
end
|
||||
subgraph "Warp 1"
|
||||
T32["Thread 32<br/>P=[1,0]"]
|
||||
T33["Thread 33<br/>P=[1,1]"]
|
||||
T34["..."]
|
||||
T63["Thread 63<br/>P=[1,31]"]
|
||||
end
|
||||
W2["Warp 2..."]
|
||||
W7["Warp 7"]
|
||||
end
|
||||
end
|
||||
|
||||
subgraph "P-space Mapping"
|
||||
PM["P-coordinates = [warp_id, lane_id]<br/>or<br/>P-coordinates = [block_x, block_y, thread_x, thread_y]"]
|
||||
end
|
||||
|
||||
T0 --> PM
|
||||
T32 --> PM
|
||||
|
||||
style T0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
|
||||
style T32 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
.. image:: diagrams/coordinate_systems_2.svg
|
||||
:alt: Diagram
|
||||
:align: center
|
||||
|
||||
The structure of P-space directly reflects the :ref:`hardware organization <ck_tile_gpu_basics>` of GPUs. Each thread receives a unique P-coordinate that encodes its position within the execution hierarchy. For simple distributions, P-space might be one-dimensional, containing only a thread ID. For complex hierarchical distributions, P-space can have multiple dimensions representing different levels of the GPU's thread organization.
|
||||
|
||||
C++ Implementation
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
|
||||
**File**: ``include/ck_tile/core/container/multi_index.hpp``
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
#include <ck_tile/core/container/multi_index.hpp>
|
||||
#include <ck_tile/core/utility/thread_id.hpp>
|
||||
|
||||
template <typename TileDistribution>
|
||||
__device__ void example_p_space_calculation()
|
||||
{
|
||||
// Get P-coordinates from hardware thread IDs
|
||||
const index_t thread_id = get_thread_local_1d_id();
|
||||
const index_t warp_id = get_warp_local_1d_id();
|
||||
const index_t lane_id = get_lane_id();
|
||||
|
||||
// Convert to multi-dimensional P-coordinates
|
||||
auto p_coord_2d = make_multi_index(warp_id, lane_id);
|
||||
|
||||
// Using tile distribution (preferred method)
|
||||
constexpr auto tile_distribution = TileDistribution{};
|
||||
const auto p_coord = tile_distribution.calculate_p_coord();
|
||||
|
||||
// P-coordinates determine:
|
||||
// 1. Work distribution - which data this thread processes
|
||||
// 2. Memory coalescing - ensuring optimal access patterns
|
||||
// 3. Thread cooperation - coordinating shared memory usage
|
||||
}
|
||||
|
||||
The P-space abstraction enables CK to handle different GPU architectures transparently. Whether running on GPUs with 32-thread warps or 64-thread wavefronts, the P-space coordinates provide a consistent interface while the underlying implementation adapts to the hardware.
|
||||
|
||||
Logical Work Organization
|
||||
----------------------------------
|
||||
|
||||
Yield Space (Y-space) represents the logical organization of work within each thread's assigned tile. While P-space identifies which thread is executing, Y-space defines what that thread does with its assigned work. This abstraction enables the expression of complex access patterns in a hardware-independent manner.
|
||||
|
||||
Work Assignment Structure
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
..
|
||||
Original mermaid diagram (edit here, then run update_diagrams.py)
|
||||
|
||||
..
|
||||
Original mermaid diagram (edit here, then run update_diagrams.py)
|
||||
|
||||
.. mermaid::
|
||||
|
||||
graph TB
|
||||
subgraph "Thread's Tile (2x2 elements)"
|
||||
Y00["Y=[0,0]<br/>Element 0"]
|
||||
Y01["Y=[0,1]<br/>Element 1"]
|
||||
Y10["Y=[1,0]<br/>Element 2"]
|
||||
Y11["Y=[1,1]<br/>Element 3"]
|
||||
end
|
||||
|
||||
subgraph "Y-space Structure"
|
||||
YS["Each thread processes<br/>the same Y-space pattern<br/>but at different X locations"]
|
||||
end
|
||||
|
||||
subgraph "Example: 4 Threads"
|
||||
T0["Thread 0<br/>P=[0,0]"]
|
||||
T1["Thread 1<br/>P=[0,1]"]
|
||||
T2["Thread 2<br/>P=[1,0]"]
|
||||
T3["Thread 3<br/>P=[1,1]"]
|
||||
end
|
||||
|
||||
Y00 --> YS
|
||||
Y01 --> YS
|
||||
Y10 --> YS
|
||||
Y11 --> YS
|
||||
|
||||
T0 --> YS
|
||||
T1 --> YS
|
||||
T2 --> YS
|
||||
T3 --> YS
|
||||
|
||||
style Y00 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
|
||||
style Y01 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
|
||||
style Y10 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
|
||||
style Y11 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
.. image:: diagrams/coordinate_systems_3.svg
|
||||
:alt: Diagram
|
||||
:align: center
|
||||
|
||||
The power of Y-space lies in its ability to express different iteration patterns without changing the underlying distribution logic. A thread might traverse its Y-space in row-major order for one algorithm, column-major for another, or even use :ref:`space-filling curves <ck_tile_space_filling_curve>` for optimal cache utilization. This flexibility enables algorithm-specific optimizations while maintaining a consistent framework.
|
||||
|
||||
Hierarchical Y-Space
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
For complex kernels, Y-space can have a hierarchical structure that mirrors the hierarchical nature of GPU architectures:
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
// Hierarchical Y-space for complex kernels
|
||||
template <typename TileDistribution>
|
||||
__device__ void example_hierarchical_y_space()
|
||||
{
|
||||
constexpr auto tile_distribution = TileDistribution{};
|
||||
|
||||
// 4D Y-space: [repeat, warp, thread, vector]
|
||||
constexpr auto y_hierarchical = make_tuple(
|
||||
number<4>{}, // Repeat dimension
|
||||
number<2>{}, // Warp dimension
|
||||
number<8>{}, // Thread dimension
|
||||
number<4>{} // Vector dimension
|
||||
);
|
||||
|
||||
// Each dimension serves different purpose:
|
||||
// - Repeat: Algorithm repetition (e.g., attention heads)
|
||||
// - Warp: Inter-warp cooperation patterns
|
||||
// - Thread: Per-thread work items
|
||||
// - Vector: SIMD vectorization
|
||||
|
||||
// Sweep through Y-space with compile-time unrolling
|
||||
sweep_tile(distributed_tensor, [&](auto y_coord) {
|
||||
// y_coord is compile-time multi_index
|
||||
// All iterations unrolled at compile time
|
||||
auto value = distributed_tensor(y_coord);
|
||||
// Process value...
|
||||
});
|
||||
}
|
||||
|
||||
Physical Tensor Coordinates
|
||||
------------------------------------
|
||||
|
||||
X-space represents the ground truth of data organization: the actual coordinates within the global tensor. This space directly corresponds to how users conceptualize their data: row and column indices for matrices, spatial coordinates for images, or multi-dimensional indices for general tensors.
|
||||
|
||||
Memory Layout Mapping
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
The relationship between X-space and physical memory involves considerations of data layout, padding, and alignment:
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
template <typename TensorDescriptor>
|
||||
__device__ void example_x_space_operations()
|
||||
{
|
||||
constexpr auto tensor_desc = TensorDescriptor{};
|
||||
|
||||
// X-space properties
|
||||
constexpr auto x_lengths = tensor_desc.get_lengths();
|
||||
constexpr auto x_strides = tensor_desc.get_strides();
|
||||
|
||||
// Direct X-coordinate specification
|
||||
constexpr auto x_coord = make_multi_index(number<3>{}, number<4>{});
|
||||
|
||||
// Convert to linear offset
|
||||
constexpr auto linear_offset = tensor_desc.calculate_offset(x_coord);
|
||||
|
||||
// X-coordinates from P+Y transformation
|
||||
const auto x_from_py = tile_dist.calculate_index(p_coord, y_coord);
|
||||
|
||||
// Bounds checking
|
||||
const bool valid = is_valid_x_coord(x_coord, x_lengths);
|
||||
}
|
||||
|
||||
The Core Transformation: P + Y → X
|
||||
----------------------------------
|
||||
|
||||
The transformation from P and Y coordinates to X coordinates represents the heart of tile distribution. This transformation encodes the entire distribution strategy, determining how logical thread work maps to physical tensor locations.
|
||||
|
||||
Transformation Pipeline
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
..
|
||||
Original mermaid diagram (edit here, then run update_diagrams.py)
|
||||
|
||||
..
|
||||
Original mermaid diagram (edit here, then run update_diagrams.py)
|
||||
|
||||
.. mermaid::
|
||||
|
||||
graph LR
|
||||
subgraph "Input"
|
||||
P["P-coordinates<br/>Thread identity<br/>P=[1,0]"]
|
||||
Y["Y-coordinates<br/>Element in tile<br/>Y=[0,1]"]
|
||||
end
|
||||
|
||||
subgraph "Transformation"
|
||||
T["P + Y → X<br/>Base position + Offset"]
|
||||
end
|
||||
|
||||
subgraph "Output"
|
||||
X["X-coordinates<br/>Tensor position<br/>X=[2,1]"]
|
||||
end
|
||||
|
||||
subgraph "Example"
|
||||
E["Thread P=[1,0] at base (2,0)<br/>Element Y=[0,1] adds offset (0,1)<br/>Result X=[2,1] in tensor"]
|
||||
end
|
||||
|
||||
P --> T
|
||||
Y --> T
|
||||
T --> X
|
||||
X --> E
|
||||
|
||||
style P fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
|
||||
style Y fill:#fff3e0,stroke:#f57c00,stroke-width:2px
|
||||
style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
.. image:: diagrams/coordinate_systems_4.svg
|
||||
:alt: Diagram
|
||||
:align: center
|
||||
|
||||
Mathematical Foundation
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
The P+Y→X transformation can be expressed mathematically as a composition of functions:
|
||||
|
||||
.. math::
|
||||
|
||||
X = f(P, Y) = BasePosition(P) + LocalOffset(Y)
|
||||
|
||||
Where:
|
||||
- BasePosition(P) determines where in the tensor this thread's tile begins
|
||||
- LocalOffset(Y) specifies the offset within the tile
|
||||
|
||||
This transformation is highly configurable through the distribution encoding, enabling different strategies for different algorithms while maintaining the same mathematical framework.
|
||||
|
||||
Replication and Cooperation
|
||||
------------------------------------
|
||||
|
||||
Replication Space (R-space) introduces a mechanism for expressing data sharing and cooperation patterns between threads. Unlike the other coordinate spaces which map to unique data elements, R-space enables multiple processing elements to work on the same data, facilitating communication and reduction operations.
|
||||
|
||||
Replication Patterns
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
template <typename TileDistribution>
|
||||
__device__ void example_r_space_operations()
|
||||
{
|
||||
constexpr auto tile_distribution = TileDistribution{};
|
||||
constexpr auto r_lengths = tile_distribution.get_r_lengths();
|
||||
|
||||
// Broadcasting with R-space
|
||||
template <typename DataType>
|
||||
__device__ auto broadcast_across_r_space(DataType value)
|
||||
{
|
||||
const auto r_coord = tile_distribution.calculate_r_coord();
|
||||
__shared__ DataType shared_value;
|
||||
|
||||
if (r_coord == make_multi_index(0, 0)) {
|
||||
shared_value = value; // Source thread
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
return shared_value; // All threads get the value
|
||||
}
|
||||
|
||||
// Reduction across R-space
|
||||
template <typename DataType>
|
||||
__device__ auto reduce_across_r_space(DataType local_value)
|
||||
{
|
||||
// Use hardware-accelerated reduction
|
||||
return block_reduce_sum(local_value);
|
||||
}
|
||||
}
|
||||
|
||||
R-space enables cooperation patterns that would be difficult to express otherwise. By providing a systematic way to identify which threads share data, it enables automatic generation of communication patterns.
|
||||
|
||||
Memory Linearization
|
||||
-----------------------------
|
||||
|
||||
D-space represents the final transformation in the coordinate pipeline: converting multi-dimensional coordinates to linear memory addresses. This transformation incorporates all the low-level details of memory layout, including stride patterns, padding, and alignment requirements.
|
||||
|
||||
Linearization Strategies
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
..
|
||||
Original mermaid diagram (edit here, then run update_diagrams.py)
|
||||
|
||||
..
|
||||
Original mermaid diagram (edit here, then run update_diagrams.py)
|
||||
|
||||
.. mermaid::
|
||||
|
||||
graph LR
|
||||
subgraph "X-coordinates"
|
||||
X["X = [2, 3]<br/>2D Position"]
|
||||
end
|
||||
|
||||
subgraph "Layout Options"
|
||||
RM["Row-Major<br/>D = 2×width + 3"]
|
||||
CM["Column-Major<br/>D = 3×height + 2"]
|
||||
BL["Blocked<br/>Complex pattern"]
|
||||
end
|
||||
|
||||
subgraph "D-coordinate"
|
||||
D["D = 11<br/>Linear Address"]
|
||||
end
|
||||
|
||||
X --> RM
|
||||
X --> CM
|
||||
X --> BL
|
||||
RM --> D
|
||||
|
||||
style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
|
||||
style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
.. image:: diagrams/coordinate_systems_5.svg
|
||||
:alt: Diagram
|
||||
:align: center
|
||||
|
||||
The linearization process must consider multiple factors:
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
template <typename TensorDescriptor>
|
||||
__device__ void example_d_space_linearization()
|
||||
{
|
||||
// Standard linearization
|
||||
template <typename XCoord>
|
||||
__device__ constexpr auto calculate_linear_offset(const XCoord& x_coord)
|
||||
{
|
||||
index_t offset = 0;
|
||||
static_for<0, ndim, 1>{}([&](auto dim) {
|
||||
offset += x_coord.at(dim) * strides.at(dim);
|
||||
});
|
||||
return offset;
|
||||
}
|
||||
|
||||
// Specialized patterns for optimization
|
||||
// Row-major: offset = x0 * N + x1
|
||||
// Column-major: offset = x1 * M + x0
|
||||
// Blocked: Complex pattern for cache efficiency
|
||||
}
|
||||
|
||||
Complete Pipeline Example
|
||||
-------------------------
|
||||
|
||||
The following is a complete example showing how all coordinate spaces work together:
|
||||
|
||||
..
|
||||
Original mermaid diagram (edit here, then run update_diagrams.py)
|
||||
|
||||
..
|
||||
Original mermaid diagram (edit here, then run update_diagrams.py)
|
||||
|
||||
.. mermaid::
|
||||
|
||||
graph TB
|
||||
subgraph "Step 1: Thread Identification"
|
||||
TID["Thread ID = 5"]
|
||||
P["P-coordinates<br/>P = [0, 5]<br/>(warp 0, lane 5)"]
|
||||
end
|
||||
|
||||
subgraph "Step 2: Work Assignment"
|
||||
Y["Y-coordinates<br/>Y = [1, 0]<br/>(element in tile)"]
|
||||
end
|
||||
|
||||
subgraph "Step 3: P+Y Transformation"
|
||||
TRANS["P + Y → X<br/>Thread position + Element offset"]
|
||||
X["X-coordinates<br/>X = [1, 5]<br/>(tensor position)"]
|
||||
end
|
||||
|
||||
subgraph "Step 4: Linearization"
|
||||
LIN["X → D<br/>Row-major: D = x₀ × width + x₁"]
|
||||
D["D-coordinate<br/>D = 13<br/>(memory address)"]
|
||||
end
|
||||
|
||||
subgraph "Step 5: Memory Access"
|
||||
MEM["Hardware accesses<br/>memory[13]"]
|
||||
end
|
||||
|
||||
TID --> P
|
||||
P --> TRANS
|
||||
Y --> TRANS
|
||||
TRANS --> X
|
||||
X --> LIN
|
||||
LIN --> D
|
||||
D --> MEM
|
||||
|
||||
style P fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
|
||||
style Y fill:#fff3e0,stroke:#f57c00,stroke-width:3px
|
||||
style X fill:#e8f5e9,stroke:#388e3c,stroke-width:3px
|
||||
style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:3px
|
||||
style MEM fill:#ffebee,stroke:#c62828,stroke-width:3px
|
||||
|
||||
|
||||
|
||||
|
||||
.. image:: diagrams/coordinate_systems_6.svg
|
||||
:alt: Diagram
|
||||
:align: center
|
||||
|
||||
Real-World Example: Matrix Multiplication
|
||||
-----------------------------------------
|
||||
|
||||
:ref:`Matrix multiplication <ck_tile_gemm_optimization>` demonstrates how coordinate systems work in practice:
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
template<typename AType, typename BType, typename CType>
|
||||
__global__ void gemm_kernel_with_coordinates(
|
||||
const AType* a_ptr, const BType* b_ptr, CType* c_ptr,
|
||||
index_t M, index_t N, index_t K)
|
||||
{
|
||||
// Define distribution encoding
|
||||
using Encoding = tile_distribution_encoding<
|
||||
sequence<>, // R: no replication
|
||||
tuple<sequence<4, 2, 8, 4>, // H for M dimension
|
||||
sequence<4, 2, 8, 4>>, // H for N dimension
|
||||
tuple<sequence<1, 2>, sequence<1, 2>>, // P mappings
|
||||
tuple<sequence<1, 1>, sequence<2, 2>>, // P minor
|
||||
sequence<1, 1, 2, 2>, // Y major
|
||||
sequence<0, 3, 0, 3> // Y minor
|
||||
>;
|
||||
|
||||
constexpr auto distribution = make_static_tile_distribution(Encoding{});
|
||||
|
||||
// Step 1: Get P-coordinates (thread identity)
|
||||
const auto p_coord = distribution.calculate_p_coord();
|
||||
|
||||
// Step 2: Iterate through Y-space (work assignment)
|
||||
sweep_tile(c_tile, [&](auto y_coord) {
|
||||
// Step 3: P+Y→X transformation
|
||||
const auto x_coord = distribution.calculate_index(p_coord, y_coord);
|
||||
|
||||
// Step 4: X→D transformation (handled by tensor view)
|
||||
// Step 5: Actual computation at these coordinates
|
||||
c_tile(y_coord) = compute_element(x_coord);
|
||||
});
|
||||
}
|
||||
|
||||
Performance Implications
|
||||
------------------------
|
||||
|
||||
The coordinate system framework enables several critical optimizations:
|
||||
|
||||
**Memory Coalescing**: By carefully structuring the P+Y→X transformation, consecutive threads access consecutive memory locations, achieving optimal memory bandwidth utilization.
|
||||
|
||||
**Cache Efficiency**: The Y-space traversal order can be designed to maximize cache reuse, keeping frequently accessed data in fast memory.
|
||||
|
||||
**Register Optimization**: The Y→D transformation enables optimal register allocation, minimizing register pressure while maximizing reuse.
|
||||
|
||||
**Vectorization**: The coordinate transformations naturally align with vector operations, enabling efficient use of SIMD instructions.
|
||||
|
||||
Summary
|
||||
-------
|
||||
|
||||
The coordinate system framework represents the mathematical foundation that enables CK's high performance and productivity benefits. Through the systematic transformation from thread identity (P-space) through logical work organization (Y-space) to physical tensor coordinates (X-space) and finally to linear memory addresses (D-space), this framework solves the fundamental challenges of GPU programming.
|
||||
|
||||
Key insights from the coordinate system framework:
|
||||
|
||||
**Separation of Concerns**: Each coordinate space captures a different aspect of the computation, enabling independent optimization of each aspect while maintaining a coherent whole.
|
||||
|
||||
**Mathematical Rigor**: The transformations between coordinate spaces are well-defined mathematical functions, enabling formal analysis and verification of distribution strategies.
|
||||
|
||||
**Hardware Abstraction**: The framework abstracts hardware details while enabling hardware-specific optimizations, achieving both portability and performance.
|
||||
|
||||
**Automatic Optimization**: By encoding distribution strategies as coordinate transformations, the framework enables automatic generation of optimal access patterns that would be impractical to implement manually.
|
||||
|
||||
**Composability**: Different distribution strategies can be expressed by composing different transformations, enabling rapid experimentation and optimization.
|
||||
|
||||
These coordinate systems provide the conceptual framework for reasoning about GPU computation and the practical tools for achieving optimal performance. As GPU architectures continue to evolve, this mathematical foundation ensures that CK programs can adapt and continue to achieve high performance.
|
||||
|
||||
Next Steps
|
||||
----------
|
||||
|
||||
With a solid understanding of the coordinate system framework, the next sections explore how these concepts are applied in practice. Return to :ref:`ck_tile_index` to see the structure of the complete CK Tile documentation.
|
||||
383
docs/conceptual/ck_tile/descriptors.rst
Normal file
@@ -0,0 +1,383 @@
|
||||
.. _ck_tile_descriptors:
|
||||
|
||||
Tensor Descriptors - Complete Tensor Specifications
|
||||
===================================================
|
||||
|
||||
Overview
|
||||
--------
|
||||
|
||||
A TensorDescriptor is the complete blueprint for a tensor. It combines a shape, stride information, and a series of :ref:`transformations <ck_tile_transforms>` into a single object that defines exactly how a tensor's data is laid out in memory. This specification enables CK Tile to create complex tensor views without any data movement.
|
||||
|
||||
In CK Tile, TensorDescriptors serve as the foundation for all tensor operations, providing:
|
||||
|
||||
- **Memory Layout Specification**: How data is arranged in physical memory
|
||||
- **Logical View Definition**: How the tensor appears to the programmer
|
||||
- **Transformation Pipeline**: A series of :ref:`coordinate transformations <ck_tile_coordinate_systems>`
|
||||
- **Zero-Copy Views**: Different logical representations of the same data, building on :ref:`BufferViews <ck_tile_buffer_views>` and :ref:`TensorViews <ck_tile_tensor_views>`
|
||||
|
||||
Creating Basic Tensor Layouts
|
||||
-----------------------------
|
||||
|
||||
CK Tile provides several ways to create tensor descriptors for common memory layouts.
|
||||
|
||||
Custom Strides
|
||||
~~~~~~~~~~~~~~
|
||||
|
||||
The most fundamental way to define a tensor is with custom strides. This provides full control over how many elements to "jump" in memory to move to the next item along each dimension. This is particularly useful for creating padded layouts required by GPU algorithms.
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
using namespace ck_tile;
|
||||
|
||||
// Create a 3x4 tensor, but make each row take up 8 elements in memory
|
||||
// (4 for data, 4 for padding)
|
||||
constexpr auto M = 3;
|
||||
constexpr auto N = 4;
|
||||
constexpr auto RowStride = 8; // Padded stride
|
||||
|
||||
auto descriptor = make_naive_tensor_descriptor(
|
||||
make_tuple(M, N), // Shape: [3, 4]
|
||||
make_tuple(RowStride, 1) // Strides: [8, 1]
|
||||
);
|
||||
|
||||
// The total memory needed is 3 rows * 8 elements/row = 24
|
||||
constexpr auto element_space_size = M * RowStride;
|
||||
|
||||
// Calculate offset of the element at [row=1, col=2]
|
||||
multi_index<2> coord{1, 2};
|
||||
auto offset = descriptor.calculate_offset(coord);
|
||||
// offset = 1*8 + 2*1 = 10
|
||||
|
||||
Packed Row-Major Layout
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
For most cases, a tightly packed, row-major layout is sufficient. The strides are calculated automatically, leaving no unused space between elements.
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
using namespace ck_tile;
|
||||
|
||||
// Create a packed 3x4 tensor
|
||||
auto descriptor_packed = make_naive_tensor_descriptor_packed(
|
||||
make_tuple(3, 4)
|
||||
);
|
||||
|
||||
// Total memory is 3 * 4 = 12 elements
|
||||
// Strides are automatically [4, 1] for row-major layout
|
||||
|
||||
// Calculate offset of the element at [row=1, col=2]
|
||||
multi_index<2> coord{1, 2};
|
||||
auto offset = descriptor_packed.calculate_offset(coord);
|
||||
// offset = 1*4 + 2*1 = 6
|
||||
|
||||
Aligned Layout
|
||||
~~~~~~~~~~~~~~
|
||||
|
||||
For GPU performance, memory layouts often need to be aligned. This function creates a row-major layout but ensures that each row's starting address is a multiple of a given alignment value, adding padding if necessary.
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
using namespace ck_tile;
|
||||
|
||||
// Create a 4x5 tensor with 8-element alignment
|
||||
constexpr auto align = 8; // Align each row to 8-element boundary
|
||||
|
||||
auto descriptor_aligned = make_naive_tensor_descriptor_aligned(
|
||||
make_tuple(4, 5),
|
||||
align
|
||||
);
|
||||
|
||||
// Without alignment, size would be 4*5=20
|
||||
// With alignment, the row stride becomes 8 (smallest multiple of 8 >= 5)
|
||||
// Total size = 4 rows * 8 elements/row = 32
|
||||
|
||||
The Pipeline Concept
|
||||
--------------------
|
||||
|
||||
Every TensorDescriptor in CK Tile can be thought of as a **transformation pipeline**. The functions above create the *first stage* of this pipeline, defining the initial :ref:`transformation <ck_tile_transforms>` that takes a simple, one-dimensional block of memory and presents it as a logical, multi-dimensional tensor view.
|
||||
|
||||
..
|
||||
Original mermaid diagram (edit here, then run update_diagrams.py)
|
||||
|
||||
.. mermaid::
|
||||
|
||||
graph LR
|
||||
subgraph "Pipeline Stages"
|
||||
S1["Stage 1<br/>Base Layout<br/>[M, N]"]
|
||||
S2["Stage 2<br/>Transform<br/>Unmerge"]
|
||||
S3["Stage 3<br/>New View<br/>[M1, M2, N]"]
|
||||
S4["Stage N<br/>Final View<br/>[...]"]
|
||||
end
|
||||
|
||||
subgraph "Same Data"
|
||||
D["Physical Memory<br/>No data movement"]
|
||||
end
|
||||
|
||||
S1 --> S2
|
||||
S2 --> S3
|
||||
S3 --> S4
|
||||
|
||||
S1 -.-> D
|
||||
S2 -.-> D
|
||||
S3 -.-> D
|
||||
S4 -.-> D
|
||||
|
||||
style D fill:#ffebee,stroke:#d32f2f,stroke-width:2px
|
||||
style S1 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
|
||||
style S3 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
|
||||
|
||||
|
||||
|
||||
|
||||
.. image:: diagrams/descriptors_1.svg
|
||||
   :alt: Diagram
|
||||
   :align: center
|
||||
|
||||
The Initial Pipeline Stage
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
A simple packed descriptor sets up a pipeline with a single transform:
|
||||
|
||||
- **Input**: The raw, one-dimensional memory buffer (hidden dimension ID 0)
|
||||
- **Output**: The logical dimensions that you interact with (hidden dimension IDs 1, 2, ...)
|
||||
|
||||
This initial stage converts linear memory addresses into multi-dimensional coordinates. See :ref:`ck_tile_adaptors` for how transforms chain together.
|
||||
|
||||
Advanced Layouts: Step-by-Step Transformation
|
||||
---------------------------------------------
|
||||
|
||||
The ``transform_tensor_descriptor`` function adds new stages to an existing descriptor's pipeline using :ref:`transforms <ck_tile_transforms>`.
|
||||
|
||||
Transform a [2, 6] Tensor into a [2, 2, 3] View
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
This example reinterprets a 2D tensor with shape [2, 6] as a 3D tensor with shape [2, 2, 3], without changing the underlying 12-element memory buffer.
|
||||
|
||||
**Step 1: Define the Base Descriptor**
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
using namespace ck_tile;
|
||||
|
||||
// Create the [2, 6] base descriptor
|
||||
auto base_descriptor = make_naive_tensor_descriptor_packed(
|
||||
make_tuple(2, 6)
|
||||
);
|
||||
|
||||
// This creates an initial pipeline stage that:
|
||||
// - Takes the raw buffer (hidden ID 0) as input
|
||||
// - Produces two outputs (hidden IDs 1 and 2)
|
||||
// - These outputs become logical dimensions 0 and 1
|
||||
|
||||
**Step 2: Define the New Transformation Stage**
|
||||
|
||||
To get from [2, 6] to [2, 2, 3], we need:
|
||||
|
||||
- **For logical dimension 0 (length 2)**: Preserve it with PassThroughTransform
|
||||
- **For logical dimension 1 (length 6)**: Split it with UnmergeTransform([2, 3])
|
||||
|
||||
**Step 3: Apply Transformation**
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
// Create the transformed descriptor
|
||||
auto transformed_descriptor = transform_tensor_descriptor(
|
||||
base_descriptor,
|
||||
make_tuple(
|
||||
make_pass_through_transform(2), // For dim 0
|
||||
make_unmerge_transform(make_tuple(2, 3)) // For dim 1
|
||||
),
|
||||
make_tuple(sequence<0>{}, sequence<1>{}), // Input mapping
|
||||
make_tuple(sequence<0>{}, sequence<1, 2>{}) // Output mapping
|
||||
);
|
||||
|
||||
// Result: A [2, 2, 3] view of the same data
|
||||
|
||||
Analysis of the Final Pipeline
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
..
|
||||
Original mermaid diagram (edit here, then run update_diagrams.py)
|
||||
|
||||
.. mermaid::
|
||||
|
||||
graph TB
|
||||
subgraph "Transform Pipeline"
|
||||
T0["Transform 0<br/>Base Unmerge<br/>Input: [0]<br/>Output: [1,2]"]
|
||||
T1["Transform 1<br/>PassThrough<br/>Input: [1]<br/>Output: [3]"]
|
||||
T2["Transform 2<br/>Unmerge<br/>Input: [2]<br/>Output: [4,5]"]
|
||||
end
|
||||
|
||||
subgraph "Hidden Dimensions"
|
||||
H0["Hidden ID 0<br/>Raw Buffer"]
|
||||
H1["Hidden ID 1<br/>Dim 0 (size 2)"]
|
||||
H2["Hidden ID 2<br/>Dim 1 (size 6)"]
|
||||
H3["Hidden ID 3<br/>Final Dim 0"]
|
||||
H4["Hidden ID 4<br/>Final Dim 1"]
|
||||
H5["Hidden ID 5<br/>Final Dim 2"]
|
||||
end
|
||||
|
||||
H0 --> T0
|
||||
T0 --> H1
|
||||
T0 --> H2
|
||||
H1 --> T1
|
||||
H2 --> T2
|
||||
T1 --> H3
|
||||
T2 --> H4
|
||||
T2 --> H5
|
||||
|
||||
style H0 fill:#ffebee,stroke:#d32f2f,stroke-width:2px
|
||||
style H3 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
|
||||
style H4 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
|
||||
style H5 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
.. image:: diagrams/descriptors_2.svg
|
||||
   :alt: Diagram
|
||||
   :align: center
|
||||
|
||||
The pipeline now has three stages:
|
||||
|
||||
1. **Base UnmergeTransform**: Converts raw buffer to [2, 6] layout
|
||||
2. **PassThroughTransform**: Preserves the first dimension
|
||||
3. **UnmergeTransform**: Splits the second dimension into [2, 3]
|
||||
|
||||
5D to 3D Block Transformation
|
||||
-----------------------------------------------------
|
||||
|
||||
These concepts are critical in :ref:`GPU programming <ck_tile_gpu_basics>`. This example transforms a 5D tensor representing a GPU thread block's workload into a simpler 3D view using MergeTransform. See :ref:`ck_tile_thread_mapping` for thread distribution details.
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
using namespace ck_tile;
|
||||
|
||||
// Define parameters (typical for a GPU block)
|
||||
constexpr auto Block_M = 256;
|
||||
constexpr auto NumWarps = 8;
|
||||
constexpr auto WarpSize = 64;
|
||||
constexpr auto KVector = 4;
|
||||
constexpr auto wavesPerK = 2;
|
||||
constexpr auto wavesPerM = NumWarps / wavesPerK;
|
||||
constexpr auto NumIssues = Block_M / wavesPerM;
|
||||
|
||||
// Create the base 5D descriptor
|
||||
auto base_descriptor = make_naive_tensor_descriptor_packed(
|
||||
make_tuple(NumIssues, wavesPerM, wavesPerK, WarpSize, KVector)
|
||||
);
|
||||
|
||||
// Transform to 3D by merging dimensions
|
||||
auto transformed_descriptor = transform_tensor_descriptor(
|
||||
base_descriptor,
|
||||
make_tuple(
|
||||
make_pass_through_transform(NumIssues),
|
||||
make_merge_transform(make_tuple(wavesPerM, wavesPerK)),
|
||||
make_merge_transform(make_tuple(WarpSize, KVector))
|
||||
),
|
||||
make_tuple(sequence<0>{}, sequence<1, 2>{}, sequence<3, 4>{}),
|
||||
make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{})
|
||||
);
|
||||
|
||||
// Result: [NumIssues, wavesPerM*wavesPerK, WarpSize*KVector]
|
||||
// This simplifies thread block management while preserving data layout
|
||||
|
||||
Common Descriptor Patterns
|
||||
--------------------------
|
||||
|
||||
Matrix Transposition
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
// Create a transposed view of a matrix
|
||||
auto transposed = transform_tensor_descriptor(
|
||||
original_matrix,
|
||||
make_tuple(
|
||||
make_pass_through_transform(N),
|
||||
make_pass_through_transform(M)
|
||||
),
|
||||
make_tuple(sequence<1>{}, sequence<0>{}), // Swap dimensions
|
||||
make_tuple(sequence<0>{}, sequence<1>{})
|
||||
);
|
||||
|
||||
Padding for Convolution
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
// Add padding to spatial dimensions
|
||||
auto padded = transform_tensor_descriptor(
|
||||
input_tensor,
|
||||
make_tuple(
|
||||
make_pass_through_transform(N), // Batch
|
||||
make_pass_through_transform(C), // Channel
|
||||
make_pad_transform(H, pad_h, pad_h), // Height
|
||||
make_pad_transform(W, pad_w, pad_w) // Width
|
||||
),
|
||||
make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}, sequence<3>{}),
|
||||
make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}, sequence<3>{})
|
||||
);
|
||||
|
||||
For a complete convolution example, see :ref:`ck_tile_convolution_example`.
|
||||
|
||||
Tensor Slicing
|
||||
~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
// Extract a sub-tensor
|
||||
auto slice = transform_tensor_descriptor(
|
||||
full_tensor,
|
||||
make_tuple(
|
||||
make_slice_transform(M, start_m, end_m),
|
||||
make_slice_transform(N, start_n, end_n)
|
||||
),
|
||||
make_tuple(sequence<0>{}, sequence<1>{}),
|
||||
make_tuple(sequence<0>{}, sequence<1>{})
|
||||
);
|
||||
|
||||
Key Concepts Summary
|
||||
--------------------
|
||||
|
||||
TensorDescriptors provide a key abstraction for tensor manipulation:
|
||||
|
||||
- **Pipeline Architecture**: Each descriptor is a transformation pipeline
|
||||
- **Zero-Copy Views**: All transformations are logical, no data movement
|
||||
- **Composability**: Complex layouts built from simple transforms
|
||||
- **GPU Optimization**: Designed for efficient GPU memory access patterns
|
||||
|
||||
Important principles:
|
||||
|
||||
1. **Always Handle All Dimensions**: When transforming, provide a transform for each input dimension
|
||||
2. **Hidden Dimension IDs**: Track the flow of data through the pipeline
|
||||
3. **Compile-Time Resolution**: All transformations resolved at compile time
|
||||
4. **Type Safety**: Template metaprogramming ensures correctness
|
||||
|
||||
Performance Considerations
|
||||
--------------------------
|
||||
|
||||
When designing tensor descriptors for GPU kernels:
|
||||
|
||||
1. **Memory Coalescing**: Ensure contiguous threads access contiguous memory
|
||||
2. **Bank Conflicts**: Avoid patterns that cause :ref:`shared memory conflicts <ck_tile_lds_bank_conflicts>`
|
||||
3. **Alignment**: Use aligned layouts for better memory throughput
|
||||
4. **Padding**: Strategic padding can improve access patterns. See :ref:`ck_tile_lds_index_swapping` for advanced techniques.
|
||||
|
||||
Next Steps
|
||||
----------
|
||||
|
||||
- :ref:`ck_tile_tile_window` - Using descriptors for efficient data loading
|
||||
- :ref:`ck_tile_tile_distribution` - How descriptors enable automatic work distribution
|
||||
- :ref:`ck_tile_convolution_example` - Real-world application of complex descriptors
|
||||
- :ref:`ck_tile_static_distributed_tensor` - Managing distributed tensors with descriptors
|
||||
- :ref:`ck_tile_gemm_optimization` - GEMM kernels using descriptor transformations
|
||||
1
docs/conceptual/ck_tile/diagrams/adaptors_1.svg
Normal file
|
After Width: | Height: | Size: 17 KiB |
1
docs/conceptual/ck_tile/diagrams/adaptors_2.svg
Normal file
|
After Width: | Height: | Size: 18 KiB |
1
docs/conceptual/ck_tile/diagrams/buffer_views_1.svg
Normal file
|
After Width: | Height: | Size: 23 KiB |
1
docs/conceptual/ck_tile/diagrams/buffer_views_2.svg
Normal file
|
After Width: | Height: | Size: 22 KiB |
1
docs/conceptual/ck_tile/diagrams/buffer_views_3.svg
Normal file
|
After Width: | Height: | Size: 24 KiB |
1
docs/conceptual/ck_tile/diagrams/buffer_views_4.svg
Normal file
|
After Width: | Height: | Size: 20 KiB |
1
docs/conceptual/ck_tile/diagrams/convolution_example.svg
Normal file
|
After Width: | Height: | Size: 20 KiB |
1
docs/conceptual/ck_tile/diagrams/coordinate_movement.svg
Normal file
|
After Width: | Height: | Size: 15 KiB |
|
After Width: | Height: | Size: 17 KiB |
|
After Width: | Height: | Size: 16 KiB |
|
After Width: | Height: | Size: 20 KiB |
|
After Width: | Height: | Size: 14 KiB |
|
After Width: | Height: | Size: 13 KiB |
|
After Width: | Height: | Size: 19 KiB |
1
docs/conceptual/ck_tile/diagrams/descriptors_1.svg
Normal file
|
After Width: | Height: | Size: 16 KiB |
1
docs/conceptual/ck_tile/diagrams/descriptors_2.svg
Normal file
|
After Width: | Height: | Size: 18 KiB |
|
After Width: | Height: | Size: 24 KiB |
|
After Width: | Height: | Size: 16 KiB |
|
After Width: | Height: | Size: 33 KiB |
|
After Width: | Height: | Size: 14 KiB |
|
After Width: | Height: | Size: 16 KiB |
|
After Width: | Height: | Size: 17 KiB |
|
After Width: | Height: | Size: 18 KiB |
1
docs/conceptual/ck_tile/diagrams/load_store_traits_1.svg
Normal file
|
After Width: | Height: | Size: 22 KiB |
1
docs/conceptual/ck_tile/diagrams/load_store_traits_2.svg
Normal file
|
After Width: | Height: | Size: 16 KiB |
1
docs/conceptual/ck_tile/diagrams/space_filling_curve.svg
Normal file
|
After Width: | Height: | Size: 16 KiB |
|
After Width: | Height: | Size: 17 KiB |
1
docs/conceptual/ck_tile/diagrams/sweep_tile_1.svg
Normal file
|
After Width: | Height: | Size: 15 KiB |
1
docs/conceptual/ck_tile/diagrams/sweep_tile_2.svg
Normal file
|
After Width: | Height: | Size: 20 KiB |
1
docs/conceptual/ck_tile/diagrams/sweep_tile_3.svg
Normal file
|
After Width: | Height: | Size: 19 KiB |
1
docs/conceptual/ck_tile/diagrams/sweep_tile_4.svg
Normal file
|
After Width: | Height: | Size: 12 KiB |
|
After Width: | Height: | Size: 17 KiB |
|
After Width: | Height: | Size: 16 KiB |
1
docs/conceptual/ck_tile/diagrams/tensor_views_1.svg
Normal file
|
After Width: | Height: | Size: 15 KiB |
1
docs/conceptual/ck_tile/diagrams/tensor_views_2.svg
Normal file
|
After Width: | Height: | Size: 13 KiB |
1
docs/conceptual/ck_tile/diagrams/tensor_views_3.svg
Normal file
|
After Width: | Height: | Size: 13 KiB |
1
docs/conceptual/ck_tile/diagrams/tensor_views_4.svg
Normal file
|
After Width: | Height: | Size: 14 KiB |
1
docs/conceptual/ck_tile/diagrams/tensor_views_5.svg
Normal file
|
After Width: | Height: | Size: 14 KiB |
1
docs/conceptual/ck_tile/diagrams/thread_mapping_1.svg
Normal file
|
After Width: | Height: | Size: 21 KiB |