Fix remaining images

2026-06-30 11:47:48 +00:00 · 2025-10-14 07:09:40 -07:00
parent c80df7a092
commit c72bd792c8
44 changed files with 2164 additions and 1530 deletions
--- a/docs/conceptual/ck_tile/adaptors.rst
+++ b/docs/conceptual/ck_tile/adaptors.rst
@@ -18,35 +18,43 @@ A TensorAdaptor encapsulates a sequence of :ref:`coordinate transformations <ck_
 .. 
   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-   .. mermaid::
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-      graph LR
-          subgraph "Adaptor Composition"
-              subgraph "Single Transform"
-                  direction TB
-                  I1["Input Coords<br/>[0,1,2]"]
-                  T1["Transform<br/>(e.g., Transpose)"]
-                  O1["Output Coords<br/>[2,0,1]"]
-                  I1 --> T1 --> O1
-              end
-   
-              subgraph "Chained Transforms"
-                  direction TB
-                  I2["Input<br/>2D"]
-                  T2A["Transform A<br/>(e.g., Merge)"]
-                  M2["Intermediate<br/>1D"]
-                  T2B["Transform B<br/>(e.g., Pad)"]
-                  O2["Output<br/>1D Padded"]
-                  I2 --> T2A --> M2 --> T2B --> O2
-              end
-          end
-   
-          style T1 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
-          style T2A fill:#fff3e0,stroke:#f57c00,stroke-width:2px
-          style T2B fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+      .. mermaid::
+      
+         graph LR
+             subgraph "Adaptor Composition"
+                 subgraph "Single Transform"
+                     direction TB
+                     I1["Input Coords<br/>[0,1,2]"]
+                     T1["Transform<br/>(e.g., Transpose)"]
+                     O1["Output Coords<br/>[2,0,1]"]
+                     I1 --> T1 --> O1
+                 end
+      
+                 subgraph "Chained Transforms"
+                     direction TB
+                     I2["Input<br/>2D"]
+                     T2A["Transform A<br/>(e.g., Merge)"]
+                     M2["Intermediate<br/>1D"]
+                     T2B["Transform B<br/>(e.g., Pad)"]
+                     O2["Output<br/>1D Padded"]
+                     I2 --> T2A --> M2 --> T2B --> O2
+                 end
+             end
+      
+             style T1 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+             style T2A fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+             style T2B fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+      
+      
   
   

+.. image:: diagrams/adaptors_1.svg
+   :alt: Diagram
+   :align: center
 .. image:: diagrams/adaptors_1.svg
   :alt: Diagram
   :align: center
@@ -118,44 +126,52 @@ The real power of adaptors comes from chaining multiple transformations together
 .. 
   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-   .. mermaid::
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-      graph LR
-          subgraph "Adaptor Chaining Flow"
-              subgraph "Adaptor 1"
-                  A1I["Bottom Dims<br/>[0,1]"]
-                  A1T["Transform:<br/>Merge[2,3]"]
-                  A1O["Top Dims<br/>[0]"]
-              end
-              
-              subgraph "Adaptor 2"
-                  A2I["Bottom Dims<br/>[0]"]
-                  A2T["Transform:<br/>Unmerge[2,3]"]
-                  A2O["Top Dims<br/>[0,1]"]
-              end
-              
-              subgraph "Chained Result"
-                  CI["Input 2D<br/>Bottom[0,1]"]
-                  CO["Output 2D<br/>Top[0,1]"]
-              end
-          end
-          
-          A1I --> A1T
-          A1T --> A1O
-          A1O --> A2I
-          A2I --> A2T
-          A2T --> A2O
-          
-          CI --> A1I
-          A2O --> CO
-          
-          style A1T fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
-          style A2T fill:#fff3e0,stroke:#f57c00,stroke-width:2px
-          style CI fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
-          style CO fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+      .. mermaid::
+      
+         graph LR
+             subgraph "Adaptor Chaining Flow"
+                 subgraph "Adaptor 1"
+                     A1I["Bottom Dims<br/>[0,1]"]
+                     A1T["Transform:<br/>Merge[2,3]"]
+                     A1O["Top Dims<br/>[0]"]
+                 end
+                 
+                 subgraph "Adaptor 2"
+                     A2I["Bottom Dims<br/>[0]"]
+                     A2T["Transform:<br/>Unmerge[2,3]"]
+                     A2O["Top Dims<br/>[0,1]"]
+                 end
+                 
+                 subgraph "Chained Result"
+                     CI["Input 2D<br/>Bottom[0,1]"]
+                     CO["Output 2D<br/>Top[0,1]"]
+                 end
+             end
+             
+             A1I --> A1T
+             A1T --> A1O
+             A1O --> A2I
+             A2I --> A2T
+             A2T --> A2O
+             
+             CI --> A1I
+             A2O --> CO
+             
+             style A1T fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+             style A2T fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+             style CI fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+             style CO fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+      
+      
   
   

+.. image:: diagrams/adaptors_2.svg
+   :alt: Diagram
+   :align: center
 .. image:: diagrams/adaptors_2.svg
   :alt: Diagram
   :align: center
--- a/docs/conceptual/ck_tile/buffer_views.rst
+++ b/docs/conceptual/ck_tile/buffer_views.rst
@@ -23,40 +23,52 @@ Memory coherence and caching policies represent another layer of complexity that
 Address Space Usage Patterns
 ----------------------------

-.. raw:: html
-
-   <div class="mermaid">
-   flowchart TB
-       subgraph CF ["Compute Flow"]
-           direction LR
-           GM1["Global Memory<br/>Input Data"] --> LDS["LDS<br/>Tile Cache"]
-           LDS --> VGPR["VGPR<br/>Working Set"]
-           VGPR --> Compute["Compute<br/>Operations"]
-           Compute --> VGPR
-           VGPR --> LDS2["LDS<br/>Reduction"]
-           LDS2 --> GM2["Global Memory<br/>Output Data"]
-       end
-       
-       subgraph UP ["Usage Pattern"]
-           direction LR
-           P1["1. Load tile from Global → LDS"]
-           P2["2. Load working set LDS → VGPR"]
-           P3["3. Compute in VGPR"]
-           P4["4. Store results VGPR → LDS"]
-           P5["5. Reduce in LDS"]
-           P6["6. Write final LDS → Global"]
-           
-           P1 --> P2 --> P3 --> P4 --> P5 --> P6
-       end
-       
-       CF ~~~ UP
-       
-       style GM1 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
-       style LDS fill:#fed7aa,stroke:#f59e0b,stroke-width:2px
-       style VGPR fill:#d1fae5,stroke:#10b981,stroke-width:2px
-       style Compute fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
-   </div>
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+      .. mermaid::
+      
+         flowchart TB
+             subgraph CF ["Compute Flow"]
+                 direction LR
+                 GM1["Global Memory<br/>Input Data"] --> LDS["LDS<br/>Tile Cache"]
+                 LDS --> VGPR["VGPR<br/>Working Set"]
+                 VGPR --> Compute["Compute<br/>Operations"]
+                 Compute --> VGPR
+                 VGPR --> LDS2["LDS<br/>Reduction"]
+                 LDS2 --> GM2["Global Memory<br/>Output Data"]
+             end
+   
+             subgraph UP ["Usage Pattern"]
+                 direction LR
+                 P1["1. Load tile from Global → LDS"]
+                 P2["2. Load working set LDS → VGPR"]
+                 P3["3. Compute in VGPR"]
+                 P4["4. Store results VGPR → LDS"]
+                 P5["5. Reduce in LDS"]
+                 P6["6. Write final LDS → Global"]
+   
+                 P1 --> P2 --> P3 --> P4 --> P5 --> P6
+             end
+   
+             CF ~~~ UP
+   
+             style GM1 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
+             style LDS fill:#fed7aa,stroke:#f59e0b,stroke-width:2px
+             style VGPR fill:#d1fae5,stroke:#10b981,stroke-width:2px
+             style Compute fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
+      
+      
+   
+   
+   

+.. image:: diagrams/buffer_views_1.svg
+   :alt: Diagram
+   :align: center
 C++ Implementation
 ------------------

@@ -176,76 +188,100 @@ The implementation of vector access maintains the same parameter structure as sc
 Scalar vs Vectorized Memory Access
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. raw:: html
-
-   <div class="mermaid">
-   graph LR
-       subgraph "Scalar Access (4 instructions)"
-           S1["Load float[0]"] --> R1["Register 1"]
-           S2["Load float[1]"] --> R2["Register 2"]
-           S3["Load float[2]"] --> R3["Register 3"]
-           S4["Load float[3]"] --> R4["Register 4"]
-       end
-       
-       subgraph "Vectorized Access (1 instruction)"
-           V1["Load float4[0]"] --> VR["Vector Register<br/>(4 floats)"]
-       end
-       
-       subgraph "Performance Impact"
-           Perf["4x fewer instructions<br/>Better memory bandwidth<br/>Reduced latency"]
-       end
-       
-       R1 & R2 & R3 & R4 --> Perf
-       VR --> Perf
-       
-       style S1 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
-       style S2 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
-       style S3 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
-       style S4 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
-       style V1 fill:#d1fae5,stroke:#10b981,stroke-width:2px
-       style Perf fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
-   </div>
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+      .. mermaid::
+      
+         graph LR
+             subgraph "Scalar Access (4 instructions)"
+                 S1["Load float[0]"] --> R1["Register 1"]
+                 S2["Load float[1]"] --> R2["Register 2"]
+                 S3["Load float[2]"] --> R3["Register 3"]
+                 S4["Load float[3]"] --> R4["Register 4"]
+             end
+   
+             subgraph "Vectorized Access (1 instruction)"
+                 V1["Load float4[0]"] --> VR["Vector Register<br/>(4 floats)"]
+             end
+   
+             subgraph "Performance Impact"
+                 Perf["4x fewer instructions<br/>Better memory bandwidth<br/>Reduced latency"]
+             end
+   
+             R1 & R2 & R3 & R4 --> Perf
+             VR --> Perf
+   
+             style S1 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
+             style S2 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
+             style S3 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
+             style S4 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
+             style V1 fill:#d1fae5,stroke:#10b981,stroke-width:2px
+             style Perf fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
+      
+      
+   
+   
+   

+.. image:: diagrams/buffer_views_2.svg
+   :alt: Diagram
+   :align: center
 Understanding BufferView Indexing
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. raw:: html
-
-   <div class="mermaid">
-   flowchart LR
-       subgraph "Input Parameters"
-           Offset["Offset<br/>(e.g., 5)"]
-           ValidFlag["Valid Flag<br/>(optional)"]
-       end
-       
-       subgraph "Processing"
-           BoundsCheck{{"Bounds Check<br/>offset < buffer_size?"}}
-           FlagCheck{{"Flag Check<br/>valid_flag == True?"}}
-           Access["Access Memory<br/>buffer[offset]"]
-       end
-       
-       subgraph "Output"
-           ValidResult["Valid Result<br/>Return value"]
-           Invalid["Invalid Result<br/>Return 0 or default"]
-       end
-       
-       Offset --> BoundsCheck
-       ValidFlag --> FlagCheck
-       
-       BoundsCheck -->|Yes| FlagCheck
-       BoundsCheck -->|No| Invalid
-       
-       FlagCheck -->|Yes| Access
-       FlagCheck -->|No| Invalid
-       
-       Access --> ValidResult
-       
-       style Offset fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
-       style ValidFlag fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
-       style ValidResult fill:#d1fae5,stroke:#10b981,stroke-width:2px
-       style Invalid fill:#fee2e2,stroke:#ef4444,stroke-width:2px
-   </div>
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+      .. mermaid::
+      
+         flowchart LR
+             subgraph "Input Parameters"
+                 Offset["Offset<br/>(e.g., 5)"]
+                 ValidFlag["Valid Flag<br/>(optional)"]
+             end
+   
+             subgraph "Processing"
+                 BoundsCheck{{"Bounds Check<br/>offset < buffer_size?"}}
+                 FlagCheck{{"Flag Check<br/>valid_flag == True?"}}
+                 Access["Access Memory<br/>buffer[offset]"]
+             end
+   
+             subgraph "Output"
+                 ValidResult["Valid Result<br/>Return value"]
+                 Invalid["Invalid Result<br/>Return 0 or default"]
+             end
+   
+             Offset --> BoundsCheck
+             ValidFlag --> FlagCheck
+   
+             BoundsCheck -->|Yes| FlagCheck
+             BoundsCheck -->|No| Invalid
+   
+             FlagCheck -->|Yes| Access
+             FlagCheck -->|No| Invalid
+   
+             Access --> ValidResult
+   
+             style Offset fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
+             style ValidFlag fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
+             style ValidResult fill:#d1fae5,stroke:#10b981,stroke-width:2px
+             style Invalid fill:#fee2e2,stroke:#ef4444,stroke-width:2px
+      
+      
+   
+   
+   

+.. image:: diagrams/buffer_views_3.svg
+   :alt: Diagram
+   :align: center
 C++ Get Operations
 ~~~~~~~~~~~~~~~~~~

@@ -341,28 +377,40 @@ Atomic Operations
 Atomic vs Non-Atomic Operations
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. raw:: html
-
-   <div class="mermaid">
-   graph TB
-       subgraph "Non-Atomic Operation (Race Condition)"
-           NA1["Thread 1: Read value (10)"] --> NA2["Thread 1: Add 5 (15)"]
-           NA3["Thread 2: Read value (10)"] --> NA4["Thread 2: Add 3 (13)"]
-           NA2 --> NA5["Thread 1: Write 15"]
-           NA4 --> NA6["Thread 2: Write 13"]
-           NA5 & NA6 --> NA7["Final value: 13 ❌<br/>(Lost update from Thread 1)"]
-       end
-       
-       subgraph "Atomic Operation (Thread-Safe)"
-           A1["Thread 1: atomic_add(5)"] --> A2["Hardware ensures<br/>serialization"]
-           A3["Thread 2: atomic_add(3)"] --> A2
-           A2 --> A4["Final value: 18 ✓<br/>(Both updates applied)"]
-       end
-       
-       style NA7 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
-       style A4 fill:#d1fae5,stroke:#10b981,stroke-width:2px
-   </div>
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+      .. mermaid::
+      
+         graph TB
+             subgraph "Non-Atomic Operation (Race Condition)"
+                 NA1["Thread 1: Read value (10)"] --> NA2["Thread 1: Add 5 (15)"]
+                 NA3["Thread 2: Read value (10)"] --> NA4["Thread 2: Add 3 (13)"]
+                 NA2 --> NA5["Thread 1: Write 15"]
+                 NA4 --> NA6["Thread 2: Write 13"]
+                 NA5 & NA6 --> NA7["Final value: 13 ❌<br/>(Lost update from Thread 1)"]
+             end
+   
+             subgraph "Atomic Operation (Thread-Safe)"
+                 A1["Thread 1: atomic_add(5)"] --> A2["Hardware ensures<br/>serialization"]
+                 A3["Thread 2: atomic_add(3)"] --> A2
+                 A2 --> A4["Final value: 18 ✓<br/>(Both updates applied)"]
+             end
+   
+             style NA7 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
+             style A4 fill:#d1fae5,stroke:#10b981,stroke-width:2px
+      
+      
+   
+   
+   

+.. image:: diagrams/buffer_views_4.svg
+   :alt: Diagram
+   :align: center
 C++ Atomic Operations
 ~~~~~~~~~~~~~~~~~~~~~

--- a/docs/conceptual/ck_tile/convert_mermaid_to_svg.py
+++ b/docs/conceptual/ck_tile/convert_mermaid_to_svg.py
@@ -31,11 +31,16 @@ RST_FILES = [
    'descriptors.rst',
    'coordinate_movement.rst',
    'adaptors.rst',
+    'introduction_motivation.rst',
+    'buffer_views.rst',
+    'tensor_views.rst',
+    'coordinate_systems.rst',
+    'tile_distribution.rst',
 ]

-# Pattern to find mermaid blocks
+# Pattern to find mermaid blocks (can be indented with 3 spaces for commented blocks)
 MERMAID_PATTERN = re.compile(
-    r'^\.\. mermaid::\s*\n((?:(?:\n|   .*))*)',
+    r'^(?:   )?\.\. mermaid::\s*\n((?:(?:\n|   .*))*)',
    re.MULTILINE
 )

--- a/docs/conceptual/ck_tile/convert_raw_html_to_commented.py
+++ b/docs/conceptual/ck_tile/convert_raw_html_to_commented.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python3
+"""Convert raw HTML mermaid blocks to commented format for SVG conversion."""
+
+import os
+import re
+
+def convert_raw_html_to_commented(content):
+    """Convert raw HTML mermaid blocks to commented mermaid format."""
+    
+    # Pattern to match raw HTML mermaid blocks
+    pattern = r'\.\. raw:: html\n\n   <div class="mermaid"[^>]*>\n(.*?)\n   </div>'
+    
+    def replace_block(match):
+        mermaid_code = match.group(1)
+        # The mermaid code in HTML has 3-space indentation, keep it
+        # but add 3 more spaces for .. mermaid:: indentation
+        mermaid_lines = mermaid_code.split('\n')
+        properly_indented = []
+        for line in mermaid_lines:
+            if line.strip():  # Non-empty line
+                # Line already has 3 spaces from HTML, add 3 more for mermaid block
+                properly_indented.append('   ' + line)
+            else:
+                properly_indented.append('')
+        
+        indented_code = '\n'.join(properly_indented)
+        
+        # Create commented format matching the expected pattern
+        commented = f""".. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+   .. mermaid::
+   
+{indented_code}
+   
+   
+"""
+        return commented
+    
+    return re.sub(pattern, replace_block, content, flags=re.DOTALL)
+
+def main():
+    """Process files with raw HTML mermaid blocks."""
+    
+    files_to_convert = [
+        'introduction_motivation.rst',
+        'buffer_views.rst',
+        'tensor_views.rst',
+        'coordinate_systems.rst',
+        'tile_distribution.rst'
+    ]
+    
+    converted_files = []
+    
+    for filename in files_to_convert:
+        if not os.path.exists(filename):
+            print(f'Skipping {filename} - not found')
+            continue
+        
+        with open(filename, 'r', encoding='utf-8') as f:
+            original = f.read()
+        
+        converted = convert_raw_html_to_commented(original)
+        
+        if converted != original:
+            with open(filename, 'w', encoding='utf-8') as f:
+                f.write(converted)
+            
+            blocks_converted = original.count('.. raw:: html')
+            converted_files.append((filename, blocks_converted))
+            print(f'✓ Converted {filename}: {blocks_converted} blocks')
+        else:
+            print(f'  {filename}: no raw HTML blocks found')
+    
+    print(f'\n=== CONVERSION COMPLETE ===')
+    print(f'Files converted: {len(converted_files)}')
+    print(f'Total blocks: {sum(c for _, c in converted_files)}')
+    print('\nNext: Run convert_mermaid_to_svg.py to generate SVG files')
+
+if __name__ == '__main__':
+    main()
--- a/docs/conceptual/ck_tile/convolution_example.rst
+++ b/docs/conceptual/ck_tile/convolution_example.rst
@@ -18,42 +18,50 @@ The key insight is that convolution can be transformed from a complex nested loo
 .. 
   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-   .. mermaid::
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-      graph TB
-          subgraph "Convolution Process"
-              I["Input Image<br/>6×6"]
-              K["Kernel<br/>3×3"]
-              SW["Sliding Window<br/>Extract 3×3 patches"]
-              DP["Dot Product<br/>Element-wise multiply & sum"]
-              O["Output<br/>4×4"]
-          end
-          
-          subgraph "Im2col Optimization"
-              W["Windows Matrix<br/>16×9<br/>(all patches)"]
-              KF["Kernel Flattened<br/>9×1"]
-              MM["Matrix Multiply<br/>W @ K"]
-              OF["Output Flattened<br/>16×1"]
-          end
-          
-          I --> SW
-          K --> DP
-          SW --> DP
-          DP --> O
-          
-          SW --> W
-          K --> KF
-          W --> MM
-          KF --> MM
-          MM --> OF
-          OF --> O
-          
-          style I fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
-          style O fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
-          style MM fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+      .. mermaid::
+      
+         graph TB
+             subgraph "Convolution Process"
+                 I["Input Image<br/>6×6"]
+                 K["Kernel<br/>3×3"]
+                 SW["Sliding Window<br/>Extract 3×3 patches"]
+                 DP["Dot Product<br/>Element-wise multiply & sum"]
+                 O["Output<br/>4×4"]
+             end
+             
+             subgraph "Im2col Optimization"
+                 W["Windows Matrix<br/>16×9<br/>(all patches)"]
+                 KF["Kernel Flattened<br/>9×1"]
+                 MM["Matrix Multiply<br/>W @ K"]
+                 OF["Output Flattened<br/>16×1"]
+             end
+             
+             I --> SW
+             K --> DP
+             SW --> DP
+             DP --> O
+             
+             SW --> W
+             K --> KF
+             W --> MM
+             KF --> MM
+             MM --> OF
+             OF --> O
+             
+             style I fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+             style O fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+             style MM fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+      
+      
   
   

+.. image:: diagrams/convolution_example.svg
+   :alt: Diagram
+   :align: center
 .. image:: diagrams/convolution_example.svg
   :alt: Diagram
   :align: center
--- a/docs/conceptual/ck_tile/coordinate_movement.rst
+++ b/docs/conceptual/ck_tile/coordinate_movement.rst
@@ -20,36 +20,44 @@ For the mathematical foundations of coordinate systems, see :ref:`ck_tile_coordi
 .. 
   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-   .. mermaid::
-   
-      graph TB
-          subgraph "Coordinate Movement System"
-              TC["TensorCoordinate<br/>Position + Descriptor Context"]
-              TAC["TensorAdaptorCoordinate<br/>Position + Transform Context"]
-              MC["move_coordinate()<br/>Efficient Navigation"]
-          end
-          
-          subgraph "Movement Example"
-              S["Start: [1,1]<br/>Offset: 5"]
-              M1["Move [0,1]<br/>→ [1,2]<br/>Offset: 6"]
-              M2["Move [1,0]<br/>→ [2,2]<br/>Offset: 10"]
-              M3["Move [1,1]<br/>→ [3,3]<br/>Offset: 15"]
-          end
-          
-          TC --> MC
-          TAC --> MC
-          
-          S --> M1
-          M1 --> M2
-          M2 --> M3
-          
-          style TC fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
-          style TAC fill:#fff3e0,stroke:#f57c00,stroke-width:2px
-          style MC fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
   
+      .. mermaid::
+      
+         graph TB
+             subgraph "Coordinate Movement System"
+                 TC["TensorCoordinate<br/>Position + Descriptor Context"]
+                 TAC["TensorAdaptorCoordinate<br/>Position + Transform Context"]
+                 MC["move_coordinate()<br/>Efficient Navigation"]
+             end
+             
+             subgraph "Movement Example"
+                 S["Start: [1,1]<br/>Offset: 5"]
+                 M1["Move [0,1]<br/>→ [1,2]<br/>Offset: 6"]
+                 M2["Move [1,0]<br/>→ [2,2]<br/>Offset: 10"]
+                 M3["Move [1,1]<br/>→ [3,3]<br/>Offset: 15"]
+             end
+             
+             TC --> MC
+             TAC --> MC
+             
+             S --> M1
+             M1 --> M2
+             M2 --> M3
+             
+             style TC fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+             style TAC fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+             style MC fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+      
+      
+      
   
   

+.. image:: diagrams/coordinate_movement.svg
+   :alt: Diagram
+   :align: center
 .. image:: diagrams/coordinate_movement.svg
   :alt: Diagram
   :align: center
--- a/docs/conceptual/ck_tile/coordinate_systems.rst
+++ b/docs/conceptual/ck_tile/coordinate_systems.rst
@@ -15,39 +15,51 @@ The Five Coordinate Spaces

 The CK framework employs five interconnected coordinate spaces, each serving a specific purpose in the journey from thread identification to memory access. These spaces work together to solve the fundamental challenge of GPU programming: efficiently distributing work across thousands of parallel threads while maintaining optimal memory access patterns.

-.. raw:: html
-
-   <div class="mermaid">
-   graph TB
-       subgraph "Coordinate Spaces Overview"
-           P["P-space<br/>Thread Identification<br/>Which thread am I?"]
-           Y["Y-space<br/>Logical Tile<br/>Which element in my tile?"]
-           X["X-space<br/>Physical Tensor<br/>Where in the tensor?"]
-           R["R-space<br/>Replication<br/>Data sharing pattern"]
-           D["D-space<br/>Linear Storage<br/>Memory address"]
-       end
-       
-       subgraph "Transformations"
-           T1["P + Y → X<br/>Thread + Element → Position"]
-           T2["X → D<br/>Position → Address"]
-       end
-       
-       P --> T1
-       Y --> T1
-       T1 --> X
-       X --> T2
-       T2 --> D
-       
-       R -.-> P
-       R -.-> Y
-       
-       style P fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
-       style Y fill:#fff3e0,stroke:#f57c00,stroke-width:2px
-       style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
-       style R fill:#fce4ec,stroke:#c2185b,stroke-width:2px
-       style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
-   </div>
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+      .. mermaid::
+      
+         graph TB
+             subgraph "Coordinate Spaces Overview"
+                 P["P-space<br/>Thread Identification<br/>Which thread am I?"]
+                 Y["Y-space<br/>Logical Tile<br/>Which element in my tile?"]
+                 X["X-space<br/>Physical Tensor<br/>Where in the tensor?"]
+                 R["R-space<br/>Replication<br/>Data sharing pattern"]
+                 D["D-space<br/>Linear Storage<br/>Memory address"]
+             end
+   
+             subgraph "Transformations"
+                 T1["P + Y → X<br/>Thread + Element → Position"]
+                 T2["X → D<br/>Position → Address"]
+             end
+   
+             P --> T1
+             Y --> T1
+             T1 --> X
+             X --> T2
+             T2 --> D
+   
+             R -.-> P
+             R -.-> Y
+   
+             style P fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+             style Y fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+             style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+             style R fill:#fce4ec,stroke:#c2185b,stroke-width:2px
+             style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
+      
+      
+   
+   
+   

+.. image:: diagrams/coordinate_systems_1.svg
+   :alt: Diagram
+   :align: center
 The Challenge and Solution
 ~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -69,41 +81,53 @@ P-space (Partition Space) represents the foundation of the coordinate system hie
 GPU Thread Hierarchy
 ~~~~~~~~~~~~~~~~~~~~

-.. raw:: html
-
-   <div class="mermaid">
-   graph TB
-       subgraph "GPU Thread Hierarchy"
-           subgraph "Block"
-               subgraph "Warp 0"
-                   T0["Thread 0<br/>P=[0,0]"]
-                   T1["Thread 1<br/>P=[0,1]"]
-                   T2["Thread 2<br/>P=[0,2]"]
-                   T31["..."]
-                   T3["Thread 31<br/>P=[0,31]"]
-               end
-               subgraph "Warp 1"
-                   T32["Thread 32<br/>P=[1,0]"]
-                   T33["Thread 33<br/>P=[1,1]"]
-                   T34["..."]
-                   T63["Thread 63<br/>P=[1,31]"]
-               end
-               W2["Warp 2..."]
-               W7["Warp 7"]
-           end
-       end
-       
-       subgraph "P-space Mapping"
-           PM["P-coordinates = [warp_id, lane_id]<br/>or<br/>P-coordinates = [block_x, block_y, thread_x, thread_y]"]
-       end
-       
-       T0 --> PM
-       T32 --> PM
-       
-       style T0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
-       style T32 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
-   </div>
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+      .. mermaid::
+      
+         graph TB
+             subgraph "GPU Thread Hierarchy"
+                 subgraph "Block"
+                     subgraph "Warp 0"
+                         T0["Thread 0<br/>P=[0,0]"]
+                         T1["Thread 1<br/>P=[0,1]"]
+                         T2["Thread 2<br/>P=[0,2]"]
+                         T31["..."]
+                         T3["Thread 31<br/>P=[0,31]"]
+                     end
+                     subgraph "Warp 1"
+                         T32["Thread 32<br/>P=[1,0]"]
+                         T33["Thread 33<br/>P=[1,1]"]
+                         T34["..."]
+                         T63["Thread 63<br/>P=[1,31]"]
+                     end
+                     W2["Warp 2..."]
+                     W7["Warp 7"]
+                 end
+             end
+   
+             subgraph "P-space Mapping"
+                 PM["P-coordinates = [warp_id, lane_id]<br/>or<br/>P-coordinates = [block_x, block_y, thread_x, thread_y]"]
+             end
+   
+             T0 --> PM
+             T32 --> PM
+   
+             style T0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+             style T32 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+      
+      
+   
+   
+   

+.. image:: diagrams/coordinate_systems_2.svg
+   :alt: Diagram
+   :align: center
 The structure of P-space directly reflects the :ref:`hardware organization <ck_tile_gpu_basics>` of modern GPUs. Each thread receives a unique P-coordinate that encodes its position within the execution hierarchy. For simple distributions, P-space might be one-dimensional, containing only a thread ID. For complex hierarchical distributions, P-space can have multiple dimensions representing different levels of the GPU's thread organization.

 C++ Implementation
@@ -147,44 +171,56 @@ Y-space (Yield Space) represents the logical organization of work within each th
 Work Assignment Structure
 ~~~~~~~~~~~~~~~~~~~~~~~~~

-.. raw:: html
-
-   <div class="mermaid">
-   graph TB
-       subgraph "Thread's Tile (2x2 elements)"
-           Y00["Y=[0,0]<br/>Element 0"]
-           Y01["Y=[0,1]<br/>Element 1"]
-           Y10["Y=[1,0]<br/>Element 2"]
-           Y11["Y=[1,1]<br/>Element 3"]
-       end
-       
-       subgraph "Y-space Structure"
-           YS["Each thread processes<br/>the same Y-space pattern<br/>but at different X locations"]
-       end
-       
-       subgraph "Example: 4 Threads"
-           T0["Thread 0<br/>P=[0,0]"]
-           T1["Thread 1<br/>P=[0,1]"]
-           T2["Thread 2<br/>P=[1,0]"]
-           T3["Thread 3<br/>P=[1,1]"]
-       end
-       
-       Y00 --> YS
-       Y01 --> YS
-       Y10 --> YS
-       Y11 --> YS
-       
-       T0 --> YS
-       T1 --> YS
-       T2 --> YS
-       T3 --> YS
-       
-       style Y00 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
-       style Y01 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
-       style Y10 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
-       style Y11 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
-   </div>
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+      .. mermaid::
+      
+         graph TB
+             subgraph "Thread's Tile (2x2 elements)"
+                 Y00["Y=[0,0]<br/>Element 0"]
+                 Y01["Y=[0,1]<br/>Element 1"]
+                 Y10["Y=[1,0]<br/>Element 2"]
+                 Y11["Y=[1,1]<br/>Element 3"]
+             end
+   
+             subgraph "Y-space Structure"
+                 YS["Each thread processes<br/>the same Y-space pattern<br/>but at different X locations"]
+             end
+   
+             subgraph "Example: 4 Threads"
+                 T0["Thread 0<br/>P=[0,0]"]
+                 T1["Thread 1<br/>P=[0,1]"]
+                 T2["Thread 2<br/>P=[1,0]"]
+                 T3["Thread 3<br/>P=[1,1]"]
+             end
+   
+             Y00 --> YS
+             Y01 --> YS
+             Y10 --> YS
+             Y11 --> YS
+   
+             T0 --> YS
+             T1 --> YS
+             T2 --> YS
+             T3 --> YS
+   
+             style Y00 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+             style Y01 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+             style Y10 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+             style Y11 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+      
+      
+   
+   
+   

+.. image:: diagrams/coordinate_systems_3.svg
+   :alt: Diagram
+   :align: center
 The power of Y-space lies in its ability to express different iteration patterns without changing the underlying distribution logic. A thread might traverse its Y-space in row-major order for one algorithm, column-major for another, or even use :ref:`space-filling curves <ck_tile_space_filling_curve>` for optimal cache utilization. This flexibility enables algorithm-specific optimizations while maintaining a consistent framework.

 Hierarchical Y-Space
@@ -265,37 +301,49 @@ The transformation from P and Y coordinates to X coordinates represents the hear
 Transformation Pipeline
 ~~~~~~~~~~~~~~~~~~~~~~~

-.. raw:: html
-
-   <div class="mermaid">
-   graph LR
-       subgraph "Input"
-           P["P-coordinates<br/>Thread identity<br/>P=[1,0]"]
-           Y["Y-coordinates<br/>Element in tile<br/>Y=[0,1]"]
-       end
-       
-       subgraph "Transformation"
-           T["P + Y → X<br/>Base position + Offset"]
-       end
-       
-       subgraph "Output"
-           X["X-coordinates<br/>Tensor position<br/>X=[2,1]"]
-       end
-       
-       subgraph "Example"
-           E["Thread P=[1,0] at base (2,0)<br/>Element Y=[0,1] adds offset (0,1)<br/>Result X=[2,1] in tensor"]
-       end
-       
-       P --> T
-       Y --> T
-       T --> X
-       X --> E
-       
-       style P fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
-       style Y fill:#fff3e0,stroke:#f57c00,stroke-width:2px
-       style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
-   </div>
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+      .. mermaid::
+      
+         graph LR
+             subgraph "Input"
+                 P["P-coordinates<br/>Thread identity<br/>P=[1,0]"]
+                 Y["Y-coordinates<br/>Element in tile<br/>Y=[0,1]"]
+             end
+   
+             subgraph "Transformation"
+                 T["P + Y → X<br/>Base position + Offset"]
+             end
+   
+             subgraph "Output"
+                 X["X-coordinates<br/>Tensor position<br/>X=[2,1]"]
+             end
+   
+             subgraph "Example"
+                 E["Thread P=[1,0] at base (2,0)<br/>Element Y=[0,1] adds offset (0,1)<br/>Result X=[2,1] in tensor"]
+             end
+   
+             P --> T
+             Y --> T
+             T --> X
+             X --> E
+   
+             style P fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+             style Y fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+             style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+      
+      
+   
+   
+   

+.. image:: diagrams/coordinate_systems_4.svg
+   :alt: Diagram
+   :align: center
 Mathematical Foundation
 ~~~~~~~~~~~~~~~~~~~~~~~

@@ -361,33 +409,45 @@ D-space represents the final transformation in the coordinate pipeline—convert
 Linearization Strategies
 ~~~~~~~~~~~~~~~~~~~~~~~~

-.. raw:: html
-
-   <div class="mermaid">
-   graph LR
-       subgraph "X-coordinates"
-           X["X = [2, 3]<br/>2D Position"]
-       end
-       
-       subgraph "Layout Options"
-           RM["Row-Major<br/>D = 2×width + 3"]
-           CM["Column-Major<br/>D = 3×height + 2"]
-           BL["Blocked<br/>Complex pattern"]
-       end
-       
-       subgraph "D-coordinate"
-           D["D = 11<br/>Linear Address"]
-       end
-       
-       X --> RM
-       X --> CM
-       X --> BL
-       RM --> D
-       
-       style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
-       style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
-   </div>
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+      .. mermaid::
+      
+         graph LR
+             subgraph "X-coordinates"
+                 X["X = [2, 3]<br/>2D Position"]
+             end
+   
+             subgraph "Layout Options"
+                 RM["Row-Major<br/>D = 2×width + 3"]
+                 CM["Column-Major<br/>D = 3×height + 2"]
+                 BL["Blocked<br/>Complex pattern"]
+             end
+   
+             subgraph "D-coordinate"
+                 D["D = 11<br/>Linear Address"]
+             end
+   
+             X --> RM
+             X --> CM
+             X --> BL
+             RM --> D
+   
+             style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+             style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
+      
+      
+   
+   
+   

+.. image:: diagrams/coordinate_systems_5.svg
+   :alt: Diagram
+   :align: center
 The linearization process must consider multiple factors:

 .. code-block:: cpp
@@ -417,48 +477,60 @@ Complete Pipeline Example

 Let's trace through a complete example showing how all coordinate spaces work together:

-.. raw:: html
-
-   <div class="mermaid">
-   graph TB
-       subgraph "Step 1: Thread Identification"
-           TID["Thread ID = 5"]
-           P["P-coordinates<br/>P = [0, 5]<br/>(warp 0, lane 5)"]
-       end
-       
-       subgraph "Step 2: Work Assignment"
-           Y["Y-coordinates<br/>Y = [1, 0]<br/>(element in tile)"]
-       end
-       
-       subgraph "Step 3: P+Y Transformation"
-           TRANS["P + Y → X<br/>Thread position + Element offset"]
-           X["X-coordinates<br/>X = [1, 5]<br/>(tensor position)"]
-       end
-       
-       subgraph "Step 4: Linearization"
-           LIN["X → D<br/>Row-major: D = x₀ × width + x₁"]
-           D["D-coordinate<br/>D = 13<br/>(memory address)"]
-       end
-       
-       subgraph "Step 5: Memory Access"
-           MEM["Hardware accesses<br/>memory[13]"]
-       end
-       
-       TID --> P
-       P --> TRANS
-       Y --> TRANS
-       TRANS --> X
-       X --> LIN
-       LIN --> D
-       D --> MEM
-       
-       style P fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
-       style Y fill:#fff3e0,stroke:#f57c00,stroke-width:3px
-       style X fill:#e8f5e9,stroke:#388e3c,stroke-width:3px
-       style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:3px
-       style MEM fill:#ffebee,stroke:#c62828,stroke-width:3px
-   </div>
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+      .. mermaid::
+      
+         graph TB
+             subgraph "Step 1: Thread Identification"
+                 TID["Thread ID = 5"]
+                 P["P-coordinates<br/>P = [0, 5]<br/>(warp 0, lane 5)"]
+             end
+   
+             subgraph "Step 2: Work Assignment"
+                 Y["Y-coordinates<br/>Y = [1, 0]<br/>(element in tile)"]
+             end
+   
+             subgraph "Step 3: P+Y Transformation"
+                 TRANS["P + Y → X<br/>Thread position + Element offset"]
+                 X["X-coordinates<br/>X = [1, 5]<br/>(tensor position)"]
+             end
+   
+             subgraph "Step 4: Linearization"
+                 LIN["X → D<br/>Row-major: D = x₀ × width + x₁"]
+                 D["D-coordinate<br/>D = 13<br/>(memory address)"]
+             end
+   
+             subgraph "Step 5: Memory Access"
+                 MEM["Hardware accesses<br/>memory[13]"]
+             end
+   
+             TID --> P
+             P --> TRANS
+             Y --> TRANS
+             TRANS --> X
+             X --> LIN
+             LIN --> D
+             D --> MEM
+   
+             style P fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
+             style Y fill:#fff3e0,stroke:#f57c00,stroke-width:3px
+             style X fill:#e8f5e9,stroke:#388e3c,stroke-width:3px
+             style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:3px
+             style MEM fill:#ffebee,stroke:#c62828,stroke-width:3px
+      
+      
+   
+   
+   

+.. image:: diagrams/coordinate_systems_6.svg
+   :alt: Diagram
+   :align: center
 Real-World Example: Matrix Multiplication
 -----------------------------------------

--- a/docs/conceptual/ck_tile/descriptors.rst
+++ b/docs/conceptual/ck_tile/descriptors.rst
@@ -99,35 +99,43 @@ Every TensorDescriptor in CK Tile can be thought of as a **transformation pipeli
 .. 
   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-   .. mermaid::
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-      graph LR
-          subgraph "Pipeline Stages"
-              S1["Stage 1<br/>Base Layout<br/>[M, N]"] 
-              S2["Stage 2<br/>Transform<br/>Unmerge"]
-              S3["Stage 3<br/>New View<br/>[M1, M2, N]"]
-              S4["Stage N<br/>Final View<br/>[...]"]
-          end
-          
-          subgraph "Same Data"
-              D["Physical Memory<br/>No data movement"]
-          end
-          
-          S1 --> S2
-          S2 --> S3
-          S3 --> S4
-          
-          S1 -.-> D
-          S2 -.-> D
-          S3 -.-> D
-          S4 -.-> D
-          
-          style D fill:#ffebee,stroke:#d32f2f,stroke-width:2px
-          style S1 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
-          style S3 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+      .. mermaid::
+      
+         graph LR
+             subgraph "Pipeline Stages"
+                 S1["Stage 1<br/>Base Layout<br/>[M, N]"] 
+                 S2["Stage 2<br/>Transform<br/>Unmerge"]
+                 S3["Stage 3<br/>New View<br/>[M1, M2, N]"]
+                 S4["Stage N<br/>Final View<br/>[...]"]
+             end
+             
+             subgraph "Same Data"
+                 D["Physical Memory<br/>No data movement"]
+             end
+             
+             S1 --> S2
+             S2 --> S3
+             S3 --> S4
+             
+             S1 -.-> D
+             S2 -.-> D
+             S3 -.-> D
+             S4 -.-> D
+             
+             style D fill:#ffebee,stroke:#d32f2f,stroke-width:2px
+             style S1 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+             style S3 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+      
+      
   
   

+.. image:: diagrams/descriptors_1.svg
+   :alt: Diagram
+   :align: center
 .. image:: diagrams/descriptors_1.svg
   :alt: Diagram
   :align: center
@@ -197,40 +205,48 @@ Analysis of the Final Pipeline
 .. 
   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-   .. mermaid::
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-      graph TB
-          subgraph "Transform Pipeline"
-              T0["Transform 0<br/>Base Unmerge<br/>Input: [0]<br/>Output: [1,2]"]
-              T1["Transform 1<br/>PassThrough<br/>Input: [1]<br/>Output: [3]"]
-              T2["Transform 2<br/>Unmerge<br/>Input: [2]<br/>Output: [4,5]"]
-          end
-          
-          subgraph "Hidden Dimensions"
-              H0["Hidden ID 0<br/>Raw Buffer"]
-              H1["Hidden ID 1<br/>Dim 0 (size 2)"]
-              H2["Hidden ID 2<br/>Dim 1 (size 6)"]
-              H3["Hidden ID 3<br/>Final Dim 0"]
-              H4["Hidden ID 4<br/>Final Dim 1"]
-              H5["Hidden ID 5<br/>Final Dim 2"]
-          end
-          
-          H0 --> T0
-          T0 --> H1
-          T0 --> H2
-          H1 --> T1
-          H2 --> T2
-          T1 --> H3
-          T2 --> H4
-          T2 --> H5
-          
-          style H0 fill:#ffebee,stroke:#d32f2f,stroke-width:2px
-          style H3 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
-          style H4 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
-          style H5 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+      .. mermaid::
+      
+         graph TB
+             subgraph "Transform Pipeline"
+                 T0["Transform 0<br/>Base Unmerge<br/>Input: [0]<br/>Output: [1,2]"]
+                 T1["Transform 1<br/>PassThrough<br/>Input: [1]<br/>Output: [3]"]
+                 T2["Transform 2<br/>Unmerge<br/>Input: [2]<br/>Output: [4,5]"]
+             end
+             
+             subgraph "Hidden Dimensions"
+                 H0["Hidden ID 0<br/>Raw Buffer"]
+                 H1["Hidden ID 1<br/>Dim 0 (size 2)"]
+                 H2["Hidden ID 2<br/>Dim 1 (size 6)"]
+                 H3["Hidden ID 3<br/>Final Dim 0"]
+                 H4["Hidden ID 4<br/>Final Dim 1"]
+                 H5["Hidden ID 5<br/>Final Dim 2"]
+             end
+             
+             H0 --> T0
+             T0 --> H1
+             T0 --> H2
+             H1 --> T1
+             H2 --> T2
+             T1 --> H3
+             T2 --> H4
+             T2 --> H5
+             
+             style H0 fill:#ffebee,stroke:#d32f2f,stroke-width:2px
+             style H3 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+             style H4 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+             style H5 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+      
+      
   
   

+.. image:: diagrams/descriptors_2.svg
+   :alt: Diagram
+   :align: center
 .. image:: diagrams/descriptors_2.svg
   :alt: Diagram
   :align: center
--- a/docs/conceptual/ck_tile/diagrams/buffer_views_1.svg
+++ b/docs/conceptual/ck_tile/diagrams/buffer_views_1.svg
--- a/docs/conceptual/ck_tile/diagrams/buffer_views_2.svg
+++ b/docs/conceptual/ck_tile/diagrams/buffer_views_2.svg
--- a/docs/conceptual/ck_tile/diagrams/buffer_views_3.svg
+++ b/docs/conceptual/ck_tile/diagrams/buffer_views_3.svg
--- a/docs/conceptual/ck_tile/diagrams/buffer_views_4.svg
+++ b/docs/conceptual/ck_tile/diagrams/buffer_views_4.svg
--- a/docs/conceptual/ck_tile/diagrams/convolution_example.svg
+++ b/docs/conceptual/ck_tile/diagrams/convolution_example.svg
--- a/docs/conceptual/ck_tile/diagrams/coordinate_systems_1.svg
+++ b/docs/conceptual/ck_tile/diagrams/coordinate_systems_1.svg
--- a/docs/conceptual/ck_tile/diagrams/coordinate_systems_2.svg
+++ b/docs/conceptual/ck_tile/diagrams/coordinate_systems_2.svg
--- a/docs/conceptual/ck_tile/diagrams/coordinate_systems_3.svg
+++ b/docs/conceptual/ck_tile/diagrams/coordinate_systems_3.svg
--- a/docs/conceptual/ck_tile/diagrams/coordinate_systems_4.svg
+++ b/docs/conceptual/ck_tile/diagrams/coordinate_systems_4.svg
--- a/docs/conceptual/ck_tile/diagrams/coordinate_systems_5.svg
+++ b/docs/conceptual/ck_tile/diagrams/coordinate_systems_5.svg
--- a/docs/conceptual/ck_tile/diagrams/coordinate_systems_6.svg
+++ b/docs/conceptual/ck_tile/diagrams/coordinate_systems_6.svg
--- a/docs/conceptual/ck_tile/diagrams/introduction_motivation_1.svg
+++ b/docs/conceptual/ck_tile/diagrams/introduction_motivation_1.svg
--- a/docs/conceptual/ck_tile/diagrams/introduction_motivation_2.svg
+++ b/docs/conceptual/ck_tile/diagrams/introduction_motivation_2.svg
--- a/docs/conceptual/ck_tile/diagrams/tensor_views_1.svg
+++ b/docs/conceptual/ck_tile/diagrams/tensor_views_1.svg
--- a/docs/conceptual/ck_tile/diagrams/tensor_views_2.svg
+++ b/docs/conceptual/ck_tile/diagrams/tensor_views_2.svg
--- a/docs/conceptual/ck_tile/diagrams/tensor_views_3.svg
+++ b/docs/conceptual/ck_tile/diagrams/tensor_views_3.svg
--- a/docs/conceptual/ck_tile/diagrams/tensor_views_4.svg
+++ b/docs/conceptual/ck_tile/diagrams/tensor_views_4.svg
--- a/docs/conceptual/ck_tile/diagrams/tensor_views_5.svg
+++ b/docs/conceptual/ck_tile/diagrams/tensor_views_5.svg
--- a/docs/conceptual/ck_tile/diagrams/tile_distribution_1.svg
+++ b/docs/conceptual/ck_tile/diagrams/tile_distribution_1.svg
--- a/docs/conceptual/ck_tile/diagrams/tile_distribution_2.svg
+++ b/docs/conceptual/ck_tile/diagrams/tile_distribution_2.svg
--- a/docs/conceptual/ck_tile/diagrams/tile_distribution_3.svg
+++ b/docs/conceptual/ck_tile/diagrams/tile_distribution_3.svg
--- a/docs/conceptual/ck_tile/diagrams/tile_distribution_4.svg
+++ b/docs/conceptual/ck_tile/diagrams/tile_distribution_4.svg
--- a/docs/conceptual/ck_tile/diagrams/tile_distribution_5.svg
+++ b/docs/conceptual/ck_tile/diagrams/tile_distribution_5.svg
--- a/docs/conceptual/ck_tile/diagrams/tile_distribution_6.svg
+++ b/docs/conceptual/ck_tile/diagrams/tile_distribution_6.svg
--- a/docs/conceptual/ck_tile/diagrams/tile_distribution_7.svg
+++ b/docs/conceptual/ck_tile/diagrams/tile_distribution_7.svg
--- a/docs/conceptual/ck_tile/diagrams/tile_window_2.svg
+++ b/docs/conceptual/ck_tile/diagrams/tile_window_2.svg
--- a/docs/conceptual/ck_tile/introduction_motivation.rst
+++ b/docs/conceptual/ck_tile/introduction_motivation.rst
@@ -17,75 +17,81 @@ In this introduction, we establish the fundamental problems that tile distributi
 The GPU Memory Problem
 ----------------------

-.. raw:: html
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+   .. mermaid::
+   
+      graph TB
+      subgraph "Random Access Pattern (Inefficient)"
+          subgraph "Threads"
+              T0_R["Thread 0"]
+              T1_R["Thread 1"] 
+              T2_R["Thread 2"]
+              T3_R["Thread 3"]
+          end

-   <div class="mermaid" style="margin: 0 auto; display: block; width: 60%;">
-       graph TB
-       subgraph "Random Access Pattern (Inefficient)"
-           subgraph "Threads"
-               T0_R["Thread 0"]
-               T1_R["Thread 1"] 
-               T2_R["Thread 2"]
-               T3_R["Thread 3"]
-           end
-           
-           subgraph "Memory"
-               M0["Mem[0]"]
-               M7["Mem[7]"]
-               M15["Mem[15]"]
-               M23["Mem[23]"]
-               M31["Mem[31]"]
-               M39["Mem[39]"]
-               M47["Mem[47]"]
-               M55["Mem[55]"]
-           end
-           
-           T0_R -.-> M23
-           T1_R -.-> M7
-           T2_R -.-> M47
-           T3_R -.-> M15
-       end
-       
-       subgraph "Tile Distribution Pattern (Efficient)"
-           subgraph "Threads_TD"
-               T0_TD["Thread 0"]
-               T1_TD["Thread 1"]
-               T2_TD["Thread 2"]
-               T3_TD["Thread 3"]
-           end
-           
-           subgraph "Memory_TD"
-               M0_TD["Mem[0]"]
-               M1_TD["Mem[1]"]
-               M2_TD["Mem[2]"]
-               M3_TD["Mem[3]"]
-               M4_TD["Mem[4]"]
-               M5_TD["Mem[5]"]
-               M6_TD["Mem[6]"]
-               M7_TD["Mem[7]"]
-           end
-           
-           T0_TD --> M0_TD
-           T0_TD --> M1_TD
-           T1_TD --> M2_TD
-           T1_TD --> M3_TD
-           T2_TD --> M4_TD
-           T2_TD --> M5_TD
-           T3_TD --> M6_TD
-           T3_TD --> M7_TD
-       end
-       
-       style T0_R fill:#fee2e2,stroke:#ef4444,stroke-width:2px
-       style T1_R fill:#fee2e2,stroke:#ef4444,stroke-width:2px
-       style T2_R fill:#fee2e2,stroke:#ef4444,stroke-width:2px
-       style T3_R fill:#fee2e2,stroke:#ef4444,stroke-width:2px
-       
-       style T0_TD fill:#d1fae5,stroke:#10b981,stroke-width:2px
-       style T1_TD fill:#d1fae5,stroke:#10b981,stroke-width:2px
-       style T2_TD fill:#d1fae5,stroke:#10b981,stroke-width:2px
-       style T3_TD fill:#d1fae5,stroke:#10b981,stroke-width:2px
-   </div>
+          subgraph "Memory"
+              M0["Mem[0]"]
+              M7["Mem[7]"]
+              M15["Mem[15]"]
+              M23["Mem[23]"]
+              M31["Mem[31]"]
+              M39["Mem[39]"]
+              M47["Mem[47]"]
+              M55["Mem[55]"]
+          end

+          T0_R -.-> M23
+          T1_R -.-> M7
+          T2_R -.-> M47
+          T3_R -.-> M15
+      end
+
+      subgraph "Tile Distribution Pattern (Efficient)"
+          subgraph "Threads_TD"
+              T0_TD["Thread 0"]
+              T1_TD["Thread 1"]
+              T2_TD["Thread 2"]
+              T3_TD["Thread 3"]
+          end
+
+          subgraph "Memory_TD"
+              M0_TD["Mem[0]"]
+              M1_TD["Mem[1]"]
+              M2_TD["Mem[2]"]
+              M3_TD["Mem[3]"]
+              M4_TD["Mem[4]"]
+              M5_TD["Mem[5]"]
+              M6_TD["Mem[6]"]
+              M7_TD["Mem[7]"]
+          end
+
+          T0_TD --> M0_TD
+          T0_TD --> M1_TD
+          T1_TD --> M2_TD
+          T1_TD --> M3_TD
+          T2_TD --> M4_TD
+          T2_TD --> M5_TD
+          T3_TD --> M6_TD
+          T3_TD --> M7_TD
+      end
+
+      style T0_R fill:#fee2e2,stroke:#ef4444,stroke-width:2px
+      style T1_R fill:#fee2e2,stroke:#ef4444,stroke-width:2px
+      style T2_R fill:#fee2e2,stroke:#ef4444,stroke-width:2px
+      style T3_R fill:#fee2e2,stroke:#ef4444,stroke-width:2px
+
+      style T0_TD fill:#d1fae5,stroke:#10b981,stroke-width:2px
+      style T1_TD fill:#d1fae5,stroke:#10b981,stroke-width:2px
+      style T2_TD fill:#d1fae5,stroke:#10b981,stroke-width:2px
+      style T3_TD fill:#d1fae5,stroke:#10b981,stroke-width:2px
+   
+   
+
+.. image:: diagrams/introduction_motivation_1.svg
+   :alt: Diagram
+   :align: center
 Why Random Memory Access is Slow
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -209,36 +215,42 @@ The Coordinate Mapping Insight

 At the heart of tile distribution lies a profound mathematical insight: efficient GPU computation requires a systematic framework for mapping between different coordinate spaces. This framework transforms the complex problem of thread-to-data assignment into a series of well-defined mathematical transformations, each serving a specific purpose in the journey from abstract algorithm to concrete hardware execution.

-.. raw:: html
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+   .. mermaid::
+   
+      graph LR
+          subgraph "Coordinate Spaces"
+              P["P-space<br/>Thread Position<br/>(thread_x, thread_y,<br/>warp_id, block_id)"]
+              Y["Y-space<br/>Local Data<br/>(y0, y1, y2, y3)"]
+              X["X-space<br/>Global Position<br/>(x0, x1)"]
+              D["D-space<br/>Memory Address<br/>(linearized)"]
+          end

-   <div class="mermaid">
-   graph LR
-       subgraph "Coordinate Spaces"
-           P["P-space<br/>Thread Position<br/>(thread_x, thread_y,<br/>warp_id, block_id)"]
-           Y["Y-space<br/>Local Data<br/>(y0, y1, y2, y3)"]
-           X["X-space<br/>Global Position<br/>(x0, x1)"]
-           D["D-space<br/>Memory Address<br/>(linearized)"]
-       end
-       
-       subgraph "Transformations"
-           T1["P + Y → X<br/>Thread data mapping"]
-           T2["X → D<br/>Memory linearization"]
-       end
-       
-       P --> T1
-       Y --> T1
-       T1 --> X
-       X --> T2
-       T2 --> D
-       
-       style P fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
-       style Y fill:#fff3e0,stroke:#f57c00,stroke-width:2px
-       style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
-       style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
-       style T1 fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
-       style T2 fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
-   </div>
+          subgraph "Transformations"
+              T1["P + Y → X<br/>Thread data mapping"]
+              T2["X → D<br/>Memory linearization"]
+          end

+          P --> T1
+          Y --> T1
+          T1 --> X
+          X --> T2
+          T2 --> D
+
+          style P fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+          style Y fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+          style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+          style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
+          style T1 fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
+          style T2 fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
+   
+   
+
+.. image:: diagrams/introduction_motivation_2.svg
+   :alt: Diagram
+   :align: center
 The elegance of this approach emerges from its separation of concerns. Each coordinate space represents a distinct aspect of the computation, and the transformations between them encapsulate specific optimization strategies. This separation allows developers to reason about their algorithms in natural terms while the framework handles the complex mapping to efficient hardware execution patterns.

 **P-space (Thread Position Space)** represents the physical organization of threads on the GPU. This space captures the hierarchical nature of GPU execution, from individual threads identified by their x and y coordinates within a block, to warps that execute in lockstep, to thread blocks that share resources. The coordinates in P-space—thread_x, thread_y, warp_id, and block_id—directly correspond to the hardware's execution model. Understanding P-space is crucial because it determines which threads can cooperate efficiently through shared memory and which threads will execute their memory accesses simultaneously.
--- a/docs/conceptual/ck_tile/lds_index_swapping.rst
+++ b/docs/conceptual/ck_tile/lds_index_swapping.rst
@@ -28,42 +28,50 @@ The original K coordinate is split into K0 and K1, where K1 represents the threa
 .. 
   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-   .. mermaid::
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-      graph TB
-          subgraph "3D LDS coordinate [K0, M, K1]"
-              K0["KPerBlock/KPack * MLdsLayer<br/>K0"]
-              M["MPerBlock/MLdsLayer<br/>M"]
-              K1["KPack<br/>K1"]
-          end
+      .. mermaid::
      
-          subgraph "XOR Transform"
-              XT["make_xor_transform"]
-          end
+         graph TB
+             subgraph "3D LDS coordinate [K0, M, K1]"
+                 K0["KPerBlock/KPack * MLdsLayer<br/>K0"]
+                 M["MPerBlock/MLdsLayer<br/>M"]
+                 K1["KPack<br/>K1"]
+             end
+         
+             subgraph "XOR Transform"
+                 XT["make_xor_transform"]
+             end
+         
+             subgraph "Update K0 with XOR transformation"
+                 K01["KPerBlock/KPack * MLdsLayer<br/>K0'"]
+                 M1["MPerBlock/MLdsLayer<br/>M"]
+                 K11["KPack<br/>K1"]
+             end
+         
+             K0 --> XT
+             M --> XT
+             K1 --> K11
+         
+             XT --> K01
+             XT --> M1
+             
+             style K0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+             style K01 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+             style M fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+             style M1 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+         
+             style K1 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+             style K11 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
      
-          subgraph "Update K0 with XOR transformation"
-              K01["KPerBlock/KPack * MLdsLayer<br/>K0'"]
-              M1["MPerBlock/MLdsLayer<br/>M"]
-              K11["KPack<br/>K1"]
-          end
      
-          K0 --> XT
-          M --> XT
-          K1 --> K11
-      
-          XT --> K01
-          XT --> M1
-          
-          style K0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
-          style K01 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
-          style M fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
-          style M1 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
-      
-          style K1 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
-          style K11 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
   
   

+.. image:: diagrams/lds_index_swapping_1.svg
+   :alt: Diagram
+   :align: center
 .. image:: diagrams/lds_index_swapping_1.svg
   :alt: Diagram
   :align: center
@@ -83,45 +91,53 @@ The transformed K0' is split into L and K0'' components, creating an intermediat
 .. 
   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-   .. mermaid::
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-      graph TB
-          subgraph "3D LDS coordinate [K0', M, K1]"
-              K0["KPerBlock/KPack * MLdsLayer<br/>K0'"]
-              M["MPerBlock/MLdsLayer<br/>M"]
-              K1["KPack<br/>K1"]
-          end
+      .. mermaid::
      
-          subgraph "Unmerge into 2 components"
-              UM["make_unmerge_transform"]
-          end
+         graph TB
+             subgraph "3D LDS coordinate [K0', M, K1]"
+                 K0["KPerBlock/KPack * MLdsLayer<br/>K0'"]
+                 M["MPerBlock/MLdsLayer<br/>M"]
+                 K1["KPack<br/>K1"]
+             end
+         
+             subgraph "Unmerge into 2 components"
+                 UM["make_unmerge_transform"]
+             end
+         
+             subgraph "4D intermediate transformation space"
+                 L["MLdsLayer<br/>L"]
+                 M1["MPerBlock/MLdsLayer<br/>M"]
+                 K01["KPerBlock/KPack<br/>K0''"]
+                 K11["KPack<br/>K1"]
+             end
+         
+             K0 --> UM
+             M --> M1
+             K1 --> K11
+         
+             UM --> L
+             UM --> K01
+             
+             style K0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+             style L fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+             style K01 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+         
+             style M fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+             style M1 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+         
+             style K1 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+             style K11 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
      
-          subgraph "4D intermediate transformation space"
-              L["MLdsLayer<br/>L"]
-              M1["MPerBlock/MLdsLayer<br/>M"]
-              K01["KPerBlock/KPack<br/>K0''"]
-              K11["KPack<br/>K1"]
-          end
      
-          K0 --> UM
-          M --> M1
-          K1 --> K11
-      
-          UM --> L
-          UM --> K01
-          
-          style K0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
-          style L fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
-          style K01 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
-      
-          style M fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
-          style M1 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
-      
-          style K1 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
-          style K11 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
   
   

+.. image:: diagrams/lds_index_swapping_2.svg
+   :alt: Diagram
+   :align: center
 .. image:: diagrams/lds_index_swapping_2.svg
   :alt: Diagram
   :align: center
@@ -142,50 +158,58 @@ The final step merges the 4D coordinates back into 2D transformed coordinates (M
 .. 
   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-   .. mermaid::
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-      graph TB
-          subgraph "4D LDS Coordinates [L, M, K0'', K1]"
-              L["MLdsLayer<br/>L"]
-              M1["MPerBlock/MLdsLayer<br/>M"]
-              K0["KPerBlock/KPack<br/>K0''"]
-              K1["KPack<br/>K1"]
-          end
+      .. mermaid::
      
-          subgraph "Merge into 1 component"
-              ME0["make_merge_transform"]
-          end
+         graph TB
+             subgraph "4D LDS Coordinates [L, M, K0'', K1]"
+                 L["MLdsLayer<br/>L"]
+                 M1["MPerBlock/MLdsLayer<br/>M"]
+                 K0["KPerBlock/KPack<br/>K0''"]
+                 K1["KPack<br/>K1"]
+             end
+         
+             subgraph "Merge into 1 component"
+                 ME0["make_merge_transform"]
+             end
+         
+             subgraph "Merge into 1 component"
+                 ME1["make_merge_transform"]
+             end
+         
+             subgraph "Transformed 2D coordinates [M', K']"
+                 M11["MPerBlock<br/>M'"]
+                 K01["KPerBlock<br/>K'"]
+             end
+         
+             L --> ME0
+             M1 --> ME0
+         
+             K0 --> ME1
+             K1 --> ME1
+         
+             ME0 --> M11
+             ME1 --> K01    
+         
+             style K0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+             style K1 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+             style K01 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+         
+             style M1 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+             style L fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+             style M11 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
      
-          subgraph "Merge into 1 component"
-              ME1["make_merge_transform"]
-          end
      
-          subgraph "Transformed 2D coordinates [M', K']"
-              M11["MPerBlock<br/>M'"]
-              K01["KPerBlock<br/>K'"]
-          end
      
-          L --> ME0
-          M1 --> ME0
      
-          K0 --> ME1
-          K1 --> ME1
-      
-          ME0 --> M11
-          ME1 --> K01    
-      
-          style K0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
-          style K1 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
-          style K01 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
-      
-          style M1 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
-          style L fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
-          style M11 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
-   
-   
   
   

+.. image:: diagrams/lds_index_swapping_3.svg
+   :alt: Diagram
+   :align: center
 .. image:: diagrams/lds_index_swapping_3.svg
   :alt: Diagram
   :align: center
--- a/docs/conceptual/ck_tile/load_store_traits.rst
+++ b/docs/conceptual/ck_tile/load_store_traits.rst
@@ -106,27 +106,35 @@ LoadStoreTraits employs a advanced algorithm to select the best dimension for ve
 .. 
   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-   .. mermaid::
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-      graph TD
-          A[Analyze Distribution] --> B{Check Each Dimension}
-          B --> C[Calculate Stride]
-          C --> D{Stride == 1?}
-          D -->|Yes| E[Candidate for Vectorization]
-          D -->|No| F[Skip Dimension]
-          E --> G[Check Alignment]
-          G --> H[Check Vector Size]
-          H --> I[Score Dimension]
-          F --> B
-          I --> J[Select Best Dimension]
-          J --> K[Configure Vector Access]
-          
-          style A fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
-          style J fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
-          style K fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+      .. mermaid::
+      
+         graph TD
+             A[Analyze Distribution] --> B{Check Each Dimension}
+             B --> C[Calculate Stride]
+             C --> D{Stride == 1?}
+             D -->|Yes| E[Candidate for Vectorization]
+             D -->|No| F[Skip Dimension]
+             E --> G[Check Alignment]
+             G --> H[Check Vector Size]
+             H --> I[Score Dimension]
+             F --> B
+             I --> J[Select Best Dimension]
+             J --> K[Configure Vector Access]
+             
+             style A fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+             style J fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+             style K fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+      
+      
   
   

+.. image:: diagrams/load_store_traits_1.svg
+   :alt: Diagram
+   :align: center
 .. image:: diagrams/load_store_traits_1.svg
   :alt: Diagram
   :align: center
@@ -174,36 +182,44 @@ LoadStoreTraits creates efficient access patterns using space-filling curves:
 .. 
   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-   .. mermaid::
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-      graph LR
-          subgraph "Linear Traversal"
-              L1["0→1→2→3"]
-              L2["4→5→6→7"]
-              L3["Cache miss"]
-              L4["8→9→10→11"]
-          end
-          
-          subgraph "Snake Pattern"
-              S1["0→1→2→3"]
-              S2["7←6←5←4"]
-              S3["Cache hit!"]
-              S4["8→9→10→11"]
-          end
-          
-          L1 --> L2
-          L2 --> L3
-          L3 --> L4
-          
-          S1 --> S2
-          S2 --> S3
-          S3 --> S4
-          
-          style L3 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
-          style S3 fill:#d1fae5,stroke:#10b981,stroke-width:2px
+      .. mermaid::
+      
+         graph LR
+             subgraph "Linear Traversal"
+                 L1["0→1→2→3"]
+                 L2["4→5→6→7"]
+                 L3["Cache miss"]
+                 L4["8→9→10→11"]
+             end
+             
+             subgraph "Snake Pattern"
+                 S1["0→1→2→3"]
+                 S2["7←6←5←4"]
+                 S3["Cache hit!"]
+                 S4["8→9→10→11"]
+             end
+             
+             L1 --> L2
+             L2 --> L3
+             L3 --> L4
+             
+             S1 --> S2
+             S2 --> S3
+             S3 --> S4
+             
+             style L3 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
+             style S3 fill:#d1fae5,stroke:#10b981,stroke-width:2px
+      
+      
   
   

+.. image:: diagrams/load_store_traits_2.svg
+   :alt: Diagram
+   :align: center
 .. image:: diagrams/load_store_traits_2.svg
   :alt: Diagram
   :align: center
--- a/docs/conceptual/ck_tile/space_filling_curve.rst
+++ b/docs/conceptual/ck_tile/space_filling_curve.rst
@@ -188,36 +188,44 @@ The snake pattern reverses traversal direction on alternate rows, minimizing the
 .. 
   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-   .. mermaid::
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-      graph LR
-          subgraph "Linear Pattern"
-              L1["Row 0: →"]
-              L2["Row 1: →"]
-              L3["Jump back"]
-              L4["Row 2: →"]
-          end
-          
-          subgraph "Snake Pattern"
-              S1["Row 0: →"]
-              S2["Row 1: ←"]
-              S3["Continue"]
-              S4["Row 2: →"]
-          end
-          
-          L1 --> L3
-          L3 --> L2
-          L2 --> L3
-          L3 --> L4
-          
-          S1 --> S2
-          S2 --> S4
-          
-          style L3 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
-          style S3 fill:#d1fae5,stroke:#10b981,stroke-width:2px
+      .. mermaid::
+      
+         graph LR
+             subgraph "Linear Pattern"
+                 L1["Row 0: →"]
+                 L2["Row 1: →"]
+                 L3["Jump back"]
+                 L4["Row 2: →"]
+             end
+             
+             subgraph "Snake Pattern"
+                 S1["Row 0: →"]
+                 S2["Row 1: ←"]
+                 S3["Continue"]
+                 S4["Row 2: →"]
+             end
+             
+             L1 --> L3
+             L3 --> L2
+             L2 --> L3
+             L3 --> L4
+             
+             S1 --> S2
+             S2 --> S4
+             
+             style L3 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
+             style S3 fill:#d1fae5,stroke:#10b981,stroke-width:2px
+      
+      
   
   

+.. image:: diagrams/space_filling_curve.svg
+   :alt: Diagram
+   :align: center
 .. image:: diagrams/space_filling_curve.svg
   :alt: Diagram
   :align: center
--- a/docs/conceptual/ck_tile/static_distributed_tensor.rst
+++ b/docs/conceptual/ck_tile/static_distributed_tensor.rst
@@ -88,21 +88,29 @@ The memory layout follows a hierarchical pattern:
 .. 
   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-   .. mermaid::
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-      graph TD
-          A[Global Tensor 64x64] --> B[Thread Block 16x16]
-          B --> C[Thread 0,0<br/>Elements 0:3,0:3]
-          B --> D[Thread 0,1<br/>Elements 0:3,4:7]
-          B --> E[Thread 1,0<br/>Elements 4:7,0:3]
-          B --> F[...]
-          
-          C --> G[Local Array<br/>16 elements]
-          D --> H[Local Array<br/>16 elements]
-          E --> I[Local Array<br/>16 elements]
+      .. mermaid::
+      
+         graph TD
+             A[Global Tensor 64x64] --> B[Thread Block 16x16]
+             B --> C[Thread 0,0<br/>Elements 0:3,0:3]
+             B --> D[Thread 0,1<br/>Elements 0:3,4:7]
+             B --> E[Thread 1,0<br/>Elements 4:7,0:3]
+             B --> F[...]
+             
+             C --> G[Local Array<br/>16 elements]
+             D --> H[Local Array<br/>16 elements]
+             E --> I[Local Array<br/>16 elements]
+      
+      
   
   

+.. image:: diagrams/static_distributed_tensor.svg
+   :alt: Diagram
+   :align: center
 .. image:: diagrams/static_distributed_tensor.svg
   :alt: Diagram
   :align: center
--- a/docs/conceptual/ck_tile/sweep_tile.rst
+++ b/docs/conceptual/ck_tile/sweep_tile.rst
@@ -18,35 +18,43 @@ The key insight is the "load once, use many times" pattern. Load X data once int
 .. 
   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-   .. mermaid::
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-      flowchart LR
-          subgraph "X-Tile (Reused)"
-              XT["X data loaded once<br/>Stays in registers"]
-          end
-          
-          subgraph "Y-Sweep"
-              Y1["Y position 0"]
-              Y2["Y position 1"]
-              Y3["Y position 2"]
-              YN["Y position N"]
-          end
-          
-          subgraph "Computation"
-              C["Process(X, Y)"]
-          end
-          
-          XT --> C
-          Y1 --> C
-          Y2 --> C
-          Y3 --> C
-          YN --> C
-          
-          style XT fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
-          style C fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
+      .. mermaid::
+      
+         flowchart LR
+             subgraph "X-Tile (Reused)"
+                 XT["X data loaded once<br/>Stays in registers"]
+             end
+             
+             subgraph "Y-Sweep"
+                 Y1["Y position 0"]
+                 Y2["Y position 1"]
+                 Y3["Y position 2"]
+                 YN["Y position N"]
+             end
+             
+             subgraph "Computation"
+                 C["Process(X, Y)"]
+             end
+             
+             XT --> C
+             Y1 --> C
+             Y2 --> C
+             Y3 --> C
+             YN --> C
+             
+             style XT fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+             style C fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
+      
+      
   
   

+.. image:: diagrams/sweep_tile_1.svg
+   :alt: Diagram
+   :align: center
 .. image:: diagrams/sweep_tile_1.svg
   :alt: Diagram
   :align: center
@@ -122,30 +130,38 @@ The sweep pattern provides significant memory efficiency benefits:
 .. 
   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-   .. mermaid::
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-      graph TB
-          subgraph "Traditional Approach"
-              T1["Load X[0]"] --> P1["Process"]
-              T2["Load Y[0]"] --> P1
-              T3["Load X[0]"] --> P2["Process"]
-              T4["Load Y[1]"] --> P2
-              T5["Load X[0]"] --> P3["Process"]
-              T6["Load Y[2]"] --> P3
-              Note1["X loaded 3 times!"]
-          end
-          
-          subgraph "Sweep Approach"
-              S1["Load X[0]"] --> SP["Process with<br/>Y[0], Y[1], Y[2]"]
-              S2["Load Y[0,1,2]"] --> SP
-              Note2["X loaded once!"]
-          end
-          
-          style Note1 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
-          style Note2 fill:#d1fae5,stroke:#10b981,stroke-width:2px
+      .. mermaid::
+      
+         graph TB
+             subgraph "Traditional Approach"
+                 T1["Load X[0]"] --> P1["Process"]
+                 T2["Load Y[0]"] --> P1
+                 T3["Load X[0]"] --> P2["Process"]
+                 T4["Load Y[1]"] --> P2
+                 T5["Load X[0]"] --> P3["Process"]
+                 T6["Load Y[2]"] --> P3
+                 Note1["X loaded 3 times!"]
+             end
+             
+             subgraph "Sweep Approach"
+                 S1["Load X[0]"] --> SP["Process with<br/>Y[0], Y[1], Y[2]"]
+                 S2["Load Y[0,1,2]"] --> SP
+                 Note2["X loaded once!"]
+             end
+             
+             style Note1 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
+             style Note2 fill:#d1fae5,stroke:#10b981,stroke-width:2px
+      
+      
   
   

+.. image:: diagrams/sweep_tile_2.svg
+   :alt: Diagram
+   :align: center
 .. image:: diagrams/sweep_tile_2.svg
   :alt: Diagram
   :align: center
@@ -381,37 +397,45 @@ Sweep operations provide several performance benefits:
 .. 
   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-   .. mermaid::
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-      graph TB
-          subgraph "Sweep Performance Benefits"
-              B1["Zero runtime overhead<br/>Compile-time unrolling"]
-              B2["Perfect memory coalescing<br/>Sequential access patterns"]
-              B3["Automatic vectorization<br/>Compiler optimizations"]
-              B4["Register reuse<br/>X data stays in VGPR"]
-          end
-          
-          subgraph "Use Cases"
-              U1["Matrix Multiplication<br/>Reuse A columns"]
-              U2["Convolution<br/>Reuse filter weights"]
-              U3["Reduction<br/>Accumulate over Y"]
-              U4["Broadcast<br/>Apply X to all Y"]
-          end
-          
-          B1 --> Performance["High Performance"]
-          B2 --> Performance
-          B3 --> Performance
-          B4 --> Performance
-          
-          Performance --> U1
-          Performance --> U2
-          Performance --> U3
-          Performance --> U4
-          
-          style Performance fill:#d1fae5,stroke:#10b981,stroke-width:3px
+      .. mermaid::
+      
+         graph TB
+             subgraph "Sweep Performance Benefits"
+                 B1["Zero runtime overhead<br/>Compile-time unrolling"]
+                 B2["Perfect memory coalescing<br/>Sequential access patterns"]
+                 B3["Automatic vectorization<br/>Compiler optimizations"]
+                 B4["Register reuse<br/>X data stays in VGPR"]
+             end
+             
+             subgraph "Use Cases"
+                 U1["Matrix Multiplication<br/>Reuse A columns"]
+                 U2["Convolution<br/>Reuse filter weights"]
+                 U3["Reduction<br/>Accumulate over Y"]
+                 U4["Broadcast<br/>Apply X to all Y"]
+             end
+             
+             B1 --> Performance["High Performance"]
+             B2 --> Performance
+             B3 --> Performance
+             B4 --> Performance
+             
+             Performance --> U1
+             Performance --> U2
+             Performance --> U3
+             Performance --> U4
+             
+             style Performance fill:#d1fae5,stroke:#10b981,stroke-width:3px
+      
+      
   
   

+.. image:: diagrams/sweep_tile_3.svg
+   :alt: Diagram
+   :align: center
 .. image:: diagrams/sweep_tile_3.svg
   :alt: Diagram
   :align: center
@@ -453,28 +477,36 @@ Complete workflow example:
 .. 
   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-   .. mermaid::
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-      flowchart TB
-          subgraph "Complete Workflow"
-              TD["TileDistribution<br/>Define data layout"]
-              TW["TileWindow<br/>Create view"]
-              DT["DistributedTensor<br/>Load X data"]
-              ST["SweepTile<br/>Iterate Y positions"]
-              R["Results<br/>Store outputs"]
-          end
-          
-          TD --> TW
-          TW --> DT
-          DT --> ST
-          ST --> R
-          
-          style TD fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
-          style ST fill:#fff3e0,stroke:#f57c00,stroke-width:3px
-          style R fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+      .. mermaid::
+      
+         flowchart TB
+             subgraph "Complete Workflow"
+                 TD["TileDistribution<br/>Define data layout"]
+                 TW["TileWindow<br/>Create view"]
+                 DT["DistributedTensor<br/>Load X data"]
+                 ST["SweepTile<br/>Iterate Y positions"]
+                 R["Results<br/>Store outputs"]
+             end
+             
+             TD --> TW
+             TW --> DT
+             DT --> ST
+             ST --> R
+             
+             style TD fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+             style ST fill:#fff3e0,stroke:#f57c00,stroke-width:3px
+             style R fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+      
+      
   
   

+.. image:: diagrams/sweep_tile_4.svg
+   :alt: Diagram
+   :align: center
 .. image:: diagrams/sweep_tile_4.svg
   :alt: Diagram
   :align: center
--- a/docs/conceptual/ck_tile/tensor_views.rst
+++ b/docs/conceptual/ck_tile/tensor_views.rst
@@ -13,40 +13,52 @@ The power of TensorView lies in its ability to present different logical views o
 TensorView Architecture
 -----------------------

-.. raw:: html
-
-   <div class="mermaid" style="margin: 0 auto; display: block; width: 60%;">
-   graph TB
-       subgraph "Memory Foundation"
-           Memory["Flat Memory Array<br/>0 1 2 3 4 5 6 7 8 9 10 11"]
-       end
-       
-       subgraph "Access Layer"
-           BufferView["BufferView<br/>Linear Memory Access"]
-           Descriptor["TensorDescriptor<br/>Shape & Stride Info"]
-       end
-       
-       subgraph "Tensor Layer"
-           TensorView["TensorView<br/>Multi-dimensional Access"]
-       end
-       
-       subgraph "Logical View"
-           Matrix["2D Matrix View<br/>[3×4]<br/>[[0,1,2,3]<br/>[4,5,6,7]<br/>[8,9,10,11]]"]
-       end
-       
-       Memory --> BufferView
-       Memory --> Descriptor
-       BufferView --> TensorView
-       Descriptor --> TensorView
-       TensorView --> Matrix
-       
-       style Memory fill:#d1fae5,stroke:#10b981,stroke-width:2px
-       style BufferView fill:#dbeafe,stroke:#3b82f6,stroke-width:2px
-       style Descriptor fill:#fed7aa,stroke:#f59e0b,stroke-width:2px
-       style TensorView fill:#fce7f3,stroke:#ec4899,stroke-width:2px
-       style Matrix fill:#e9d5ff,stroke:#9333ea,stroke-width:2px
-   </div>
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+      .. mermaid::
+      
+         graph TB
+             subgraph "Memory Foundation"
+                 Memory["Flat Memory Array<br/>0 1 2 3 4 5 6 7 8 9 10 11"]
+             end
+   
+             subgraph "Access Layer"
+                 BufferView["BufferView<br/>Linear Memory Access"]
+                 Descriptor["TensorDescriptor<br/>Shape & Stride Info"]
+             end
+   
+             subgraph "Tensor Layer"
+                 TensorView["TensorView<br/>Multi-dimensional Access"]
+             end
+   
+             subgraph "Logical View"
+                 Matrix["2D Matrix View<br/>[3×4]<br/>[[0,1,2,3]<br/>[4,5,6,7]<br/>[8,9,10,11]]"]
+             end
+   
+             Memory --> BufferView
+             Memory --> Descriptor
+             BufferView --> TensorView
+             Descriptor --> TensorView
+             TensorView --> Matrix
+   
+             style Memory fill:#d1fae5,stroke:#10b981,stroke-width:2px
+             style BufferView fill:#dbeafe,stroke:#3b82f6,stroke-width:2px
+             style Descriptor fill:#fed7aa,stroke:#f59e0b,stroke-width:2px
+             style TensorView fill:#fce7f3,stroke:#ec4899,stroke-width:2px
+             style Matrix fill:#e9d5ff,stroke:#9333ea,stroke-width:2px
+      
+      
+   
+   
+   

+.. image:: diagrams/tensor_views_1.svg
+   :alt: Diagram
+   :align: center
 The Foundation: BufferView and TensorDescriptor
 ------------------------------------------------

@@ -109,68 +121,92 @@ Coordinate-Based Access

 The fundamental operation of TensorView is translating multi-dimensional coordinates into memory accesses. This translation happens through a advanced pipeline that maintains efficiency while providing flexibility:

-.. raw:: html
-
-   <div class="mermaid">
-   flowchart LR
-       subgraph "User Input"
-           Coord["Coordinate<br/>(1, 2)"]
-       end
-       
-       subgraph "TensorView Processing"
-           Shape["Shape Check<br/>row < 3?<br/>col < 4?"]
-           Stride["Apply Strides<br/>offset = 1×4 + 2×1"]
-           Buffer["BufferView Access<br/>buffer[6]"]
-       end
-       
-       subgraph "Result"
-           Value["Value: 6"]
-       end
-       
-       Coord --> Shape
-       Shape -->|Valid| Stride
-       Stride --> Buffer
-       Buffer --> Value
-       
-       style Coord fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
-       style Shape fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
-       style Stride fill:#dcfce7,stroke:#10b981,stroke-width:2px
-       style Buffer fill:#dbeafe,stroke:#3b82f6,stroke-width:2px
-       style Value fill:#d1fae5,stroke:#10b981,stroke-width:2px
-   </div>
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+      .. mermaid::
+      
+         flowchart LR
+             subgraph "User Input"
+                 Coord["Coordinate<br/>(1, 2)"]
+             end
+   
+             subgraph "TensorView Processing"
+                 Shape["Shape Check<br/>row < 3?<br/>col < 4?"]
+                 Stride["Apply Strides<br/>offset = 1×4 + 2×1"]
+                 Buffer["BufferView Access<br/>buffer[6]"]
+             end
+   
+             subgraph "Result"
+                 Value["Value: 6"]
+             end
+   
+             Coord --> Shape
+             Shape -->|Valid| Stride
+             Stride --> Buffer
+             Buffer --> Value
+   
+             style Coord fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
+             style Shape fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
+             style Stride fill:#dcfce7,stroke:#10b981,stroke-width:2px
+             style Buffer fill:#dbeafe,stroke:#3b82f6,stroke-width:2px
+             style Value fill:#d1fae5,stroke:#10b981,stroke-width:2px
+      
+      
+   
+   
+   

+.. image:: diagrams/tensor_views_2.svg
+   :alt: Diagram
+   :align: center
 Memory Layouts and Strides
 --------------------------

 One of the most key features of TensorView is its ability to represent different memory layouts through stride manipulation. This capability enables zero-copy transformations that would otherwise require expensive memory operations:

-.. raw:: html
-
-   <div class="mermaid">
-   graph TB
-       subgraph "Row-Major Layout (C-style)"
-           RM["Memory: [0,1,2,3,4,5,6,7,8,9,10,11]<br/>Shape: (3,4)<br/>Strides: (4,1)"]
-           RMMatrix["[[0, 1, 2, 3]<br/> [4, 5, 6, 7]<br/> [8, 9, 10, 11]]"]
-           RM --> RMMatrix
-       end
-       
-       subgraph "Column-Major Layout (Fortran-style)"
-           CM["Memory: [0,3,6,9,1,4,7,10,2,5,8,11]<br/>Shape: (3,4)<br/>Strides: (1,3)"]
-           CMMatrix["[[0, 1, 2, 3]<br/> [4, 5, 6, 7]<br/> [8, 9, 10, 11]]"]
-           CM --> CMMatrix
-       end
-       
-       subgraph "Custom Stride (Transposed View)"
-           TV["Memory: [0,1,2,3,4,5,6,7,8,9,10,11]<br/>Shape: (4,3)<br/>Strides: (1,4)"]
-           TVMatrix["[[0, 4, 8]<br/> [1, 5, 9]<br/> [2, 6, 10]<br/> [3, 7, 11]]"]
-           TV --> TVMatrix
-       end
-       
-       style RM fill:#e0f2fe,stroke:#0284c7,stroke-width:2px
-       style CM fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
-       style TV fill:#f3e8ff,stroke:#9333ea,stroke-width:2px
-   </div>
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+      .. mermaid::
+      
+         graph TB
+             subgraph "Row-Major Layout (C-style)"
+                 RM["Memory: [0,1,2,3,4,5,6,7,8,9,10,11]<br/>Shape: (3,4)<br/>Strides: (4,1)"]
+                 RMMatrix["[[0, 1, 2, 3]<br/> [4, 5, 6, 7]<br/> [8, 9, 10, 11]]"]
+                 RM --> RMMatrix
+             end
+   
+             subgraph "Column-Major Layout (Fortran-style)"
+                 CM["Memory: [0,3,6,9,1,4,7,10,2,5,8,11]<br/>Shape: (3,4)<br/>Strides: (1,3)"]
+                 CMMatrix["[[0, 1, 2, 3]<br/> [4, 5, 6, 7]<br/> [8, 9, 10, 11]]"]
+                 CM --> CMMatrix
+             end
+   
+             subgraph "Custom Stride (Transposed View)"
+                 TV["Memory: [0,1,2,3,4,5,6,7,8,9,10,11]<br/>Shape: (4,3)<br/>Strides: (1,4)"]
+                 TVMatrix["[[0, 4, 8]<br/> [1, 5, 9]<br/> [2, 6, 10]<br/> [3, 7, 11]]"]
+                 TV --> TVMatrix
+             end
+   
+             style RM fill:#e0f2fe,stroke:#0284c7,stroke-width:2px
+             style CM fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
+             style TV fill:#f3e8ff,stroke:#9333ea,stroke-width:2px
+      
+      
+   
+   
+   

+.. image:: diagrams/tensor_views_3.svg
+   :alt: Diagram
+   :align: center
 Row-Major vs Column-Major Layouts
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -287,33 +323,45 @@ Memory Access Patterns

 The efficiency of TensorView operations depends critically on memory access patterns. Understanding these patterns is essential for achieving optimal performance (see :ref:`ck_tile_gpu_basics` for hardware considerations):

-.. raw:: html
-
-   <div class="mermaid" style="margin: 0 auto; display: block; width:70%;">
-   graph LR
-       subgraph "Memory Access Patterns"
-           Seq["Sequential Access<br/>(Good cache usage)"]
-           Stride["Strided Access<br/>(May cause cache misses)"]
-           Random["Random Access<br/>(Poor cache usage)"]
-       end
-       
-       subgraph "Optimization Strategies"
-           Opt1["Use row-major for row iteration"]
-           Opt2["Use col-major for column iteration"]
-           Opt3["Minimize stride between accesses"]
-           Opt4["Vectorize when possible"]
-       end
-       
-       Seq --> Opt1
-       Stride --> Opt2
-       Stride --> Opt3
-       Random --> Opt4
-       
-       style Seq fill:#d1fae5,stroke:#10b981,stroke-width:2px
-       style Stride fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
-       style Random fill:#fee2e2,stroke:#ef4444,stroke-width:2px
-   </div>
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+      .. mermaid::
+      
+         graph LR
+             subgraph "Memory Access Patterns"
+                 Seq["Sequential Access<br/>(Good cache usage)"]
+                 Stride["Strided Access<br/>(May cause cache misses)"]
+                 Random["Random Access<br/>(Poor cache usage)"]
+             end
+   
+             subgraph "Optimization Strategies"
+                 Opt1["Use row-major for row iteration"]
+                 Opt2["Use col-major for column iteration"]
+                 Opt3["Minimize stride between accesses"]
+                 Opt4["Vectorize when possible"]
+             end
+   
+             Seq --> Opt1
+             Stride --> Opt2
+             Stride --> Opt3
+             Random --> Opt4
+   
+             style Seq fill:#d1fae5,stroke:#10b981,stroke-width:2px
+             style Stride fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
+             style Random fill:#fee2e2,stroke:#ef4444,stroke-width:2px
+      
+      
+   
+   
+   

+.. image:: diagrams/tensor_views_4.svg
+   :alt: Diagram
+   :align: center
 Compile-Time Optimization
 ~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -339,36 +387,48 @@ TensorView vs BufferView

 Understanding when to use TensorView versus BufferView is crucial for writing efficient code:

-.. raw:: html
-
-   <div class="mermaid">
-   graph TB
-       subgraph "BufferView"
-           BV1["Linear indexing only"]
-           BV2["buffer[5]"]
-           BV3["No shape information"]
-           BV4["Direct memory access"]
-       end
-       
-       subgraph "TensorView"
-           TV1["Multi-dimensional indexing"]
-           TV2["tensor(1, 2)"]
-           TV3["Shape-aware operations"]
-           TV4["Coordinate transformations"]
-       end
-       
-       subgraph "Use Cases"
-           UC1["BufferView: Low-level memory ops"]
-           UC2["TensorView: Matrix/tensor algorithms"]
-       end
-       
-       BV1 --> UC1
-       TV1 --> UC2
-       
-       style BV1 fill:#dbeafe,stroke:#3b82f6,stroke-width:2px
-       style TV1 fill:#fce7f3,stroke:#ec4899,stroke-width:2px
-   </div>
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+      .. mermaid::
+      
+         graph TB
+             subgraph "BufferView"
+                 BV1["Linear indexing only"]
+                 BV2["buffer[5]"]
+                 BV3["No shape information"]
+                 BV4["Direct memory access"]
+             end
+   
+             subgraph "TensorView"
+                 TV1["Multi-dimensional indexing"]
+                 TV2["tensor(1, 2)"]
+                 TV3["Shape-aware operations"]
+                 TV4["Coordinate transformations"]
+             end
+   
+             subgraph "Use Cases"
+                 UC1["BufferView: Low-level memory ops"]
+                 UC2["TensorView: Matrix/tensor algorithms"]
+             end
+   
+             BV1 --> UC1
+             TV1 --> UC2
+   
+             style BV1 fill:#dbeafe,stroke:#3b82f6,stroke-width:2px
+             style TV1 fill:#fce7f3,stroke:#ec4899,stroke-width:2px
+      
+      
+   
+   
+   

+.. image:: diagrams/tensor_views_5.svg
+   :alt: Diagram
+   :align: center
 BufferView excels at raw memory operations where linear access is natural or where the overhead of coordinate calculation would be prohibitive. TensorView shines when algorithms naturally think in terms of multi-dimensional coordinates, such as matrix operations, image processing, or tensor contractions.

 Integration with Tile Distribution
--- a/docs/conceptual/ck_tile/thread_mapping.rst
+++ b/docs/conceptual/ck_tile/thread_mapping.rst
@@ -82,53 +82,61 @@ Composable Kernel abstracts thread identification into partition indices, buildi
 .. 
   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-   .. mermaid::
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-       graph TB
-           subgraph "GPU Device"
-               subgraph "Thread Block"
-                   subgraph "Warp 0"
-                       T0["Thread 0<br/>lane_id=0"]
-                       T1["Thread 1<br/>lane_id=1"]
-                       T2["..."]
-                       T31["Thread 31<br/>lane_id=31"]
-                   end
-                   
-                   subgraph "Warp 1"
-                       T32["Thread 32<br/>lane_id=0"]
-                       T33["Thread 33<br/>lane_id=1"]
-                       T34["..."]
-                       T63["Thread 63<br/>lane_id=31"]
-                   end
-                   
-                   W2["Warp 2"]
-                   W3["..."]
-                   W7["Warp 7"]
-               end
-           end
-           
-           subgraph "Thread Identification"
-               TID["Thread ID = blockIdx.x * blockDim.x + threadIdx.x"]
-               WID["Warp ID = threadIdx.x / 32"]
-               LID["Lane ID = threadIdx.x % 32"]
-           end
-           
-           subgraph "P-space Mapping"
-               P["P-coordinates<br/>NDimP=1: [thread_id]<br/>NDimP=2: [warp_id, lane_id]"]
-           end
-           
-           T0 --> TID
-           TID --> WID
-           TID --> LID
-           WID --> P
-           LID --> P
-           
-           style T0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
-           style T32 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
-           style P fill:#fff3e0,stroke:#f57c00,stroke-width:3px
+      .. mermaid::
+      
+          graph TB
+              subgraph "GPU Device"
+                  subgraph "Thread Block"
+                      subgraph "Warp 0"
+                          T0["Thread 0<br/>lane_id=0"]
+                          T1["Thread 1<br/>lane_id=1"]
+                          T2["..."]
+                          T31["Thread 31<br/>lane_id=31"]
+                      end
+                      
+                      subgraph "Warp 1"
+                          T32["Thread 32<br/>lane_id=0"]
+                          T33["Thread 33<br/>lane_id=1"]
+                          T34["..."]
+                          T63["Thread 63<br/>lane_id=31"]
+                      end
+                      
+                      W2["Warp 2"]
+                      W3["..."]
+                      W7["Warp 7"]
+                  end
+              end
+              
+              subgraph "Thread Identification"
+                  TID["Thread ID = blockIdx.x * blockDim.x + threadIdx.x"]
+                  WID["Warp ID = threadIdx.x / 32"]
+                  LID["Lane ID = threadIdx.x % 32"]
+              end
+              
+              subgraph "P-space Mapping"
+                  P["P-coordinates<br/>NDimP=1: [thread_id]<br/>NDimP=2: [warp_id, lane_id]"]
+              end
+              
+              T0 --> TID
+              TID --> WID
+              TID --> LID
+              WID --> P
+              LID --> P
+              
+              style T0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+              style T32 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+              style P fill:#fff3e0,stroke:#f57c00,stroke-width:3px
+      
+      
   
   

+.. image:: diagrams/thread_mapping_1.svg
+   :alt: Diagram
+   :align: center
 .. image:: diagrams/thread_mapping_1.svg
   :alt: Diagram
   :align: center
@@ -179,43 +187,51 @@ Once threads know their IDs, they need to map those IDs to specific data element
 .. 
   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-   .. mermaid::
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-       graph TB
-           subgraph "Thread to Data Mapping"
-               subgraph "Thread Grid"
-                   T00["Thread[0,0]<br/>Warp 0"]
-                   T01["Thread[0,1]<br/>Warp 0"]
-                   T10["Thread[1,0]<br/>Warp 1"]
-                   T11["Thread[1,1]<br/>Warp 1"]
-               end
-               
-               subgraph "Data Tiles"
-                   D00["Data[0:4, 0:4]<br/>16 elements"]
-                   D01["Data[0:4, 4:8]<br/>16 elements"]
-                   D10["Data[4:8, 0:4]<br/>16 elements"]
-                   D11["Data[4:8, 4:8]<br/>16 elements"]
-               end
-               
-               subgraph "Memory Access"
-                   MA["Coalesced Access<br/>Adjacent threads → Adjacent memory"]
-               end
-           end
-           
-           T00 --> D00
-           T01 --> D01
-           T10 --> D10
-           T11 --> D11
-           
-           D00 --> MA
-           D01 --> MA
-           
-           style T00 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
-           style D00 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
-           style MA fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+      .. mermaid::
+      
+          graph TB
+              subgraph "Thread to Data Mapping"
+                  subgraph "Thread Grid"
+                      T00["Thread[0,0]<br/>Warp 0"]
+                      T01["Thread[0,1]<br/>Warp 0"]
+                      T10["Thread[1,0]<br/>Warp 1"]
+                      T11["Thread[1,1]<br/>Warp 1"]
+                  end
+                  
+                  subgraph "Data Tiles"
+                      D00["Data[0:4, 0:4]<br/>16 elements"]
+                      D01["Data[0:4, 4:8]<br/>16 elements"]
+                      D10["Data[4:8, 0:4]<br/>16 elements"]
+                      D11["Data[4:8, 4:8]<br/>16 elements"]
+                  end
+                  
+                  subgraph "Memory Access"
+                      MA["Coalesced Access<br/>Adjacent threads → Adjacent memory"]
+                  end
+              end
+              
+              T00 --> D00
+              T01 --> D01
+              T10 --> D10
+              T11 --> D11
+              
+              D00 --> MA
+              D01 --> MA
+              
+              style T00 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+              style D00 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+              style MA fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+      
+      
   
   

+.. image:: diagrams/thread_mapping_2.svg
+   :alt: Diagram
+   :align: center
 .. image:: diagrams/thread_mapping_2.svg
   :alt: Diagram
   :align: center
--- a/docs/conceptual/ck_tile/tile_distribution.rst
+++ b/docs/conceptual/ck_tile/tile_distribution.rst
@@ -19,79 +19,103 @@ The elegance of this design lies in its ability to adapt to diverse computationa
 Complete Tile Distribution System Overview
 ------------------------------------------

-.. raw:: html
-
-   <div class="mermaid">
-   graph TB
-       subgraph "Logical View"
-           T["Tensor<br/>Multi-dimensional data"]
-           TD["TileDistribution<br/>Work assignment"]
-           TW["TileWindow<br/>Data view"]
-       end
-       
-       subgraph "Coordinate Spaces"
-           X["X: Physical tensor coords"]
-           Y["Y: Tile pattern coords"]
-           P["P: Processing element coords"]
-           R["R: Replication coords (optional)"]
-       end
-       
-       subgraph "GPU Execution"
-           W["Warps<br/>32 threads each"]
-           L["Lanes<br/>Thread within warp"]
-           REG["Registers<br/>Thread-local storage"]
-       end
-       
-       T --> TD
-       TD --> TW
-       
-       TD --> X
-       TD --> Y
-       TD --> P
-       TD --> R
-       
-       P --> W
-       P --> L
-       TW --> REG
-       
-       style TD fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
-       style P fill:#fff3e0,stroke:#f57c00,stroke-width:2px
-       style REG fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
-   </div>
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+      .. mermaid::
+      
+         graph TB
+             subgraph "Logical View"
+                 T["Tensor<br/>Multi-dimensional data"]
+                 TD["TileDistribution<br/>Work assignment"]
+                 TW["TileWindow<br/>Data view"]
+             end
+   
+             subgraph "Coordinate Spaces"
+                 X["X: Physical tensor coords"]
+                 Y["Y: Tile pattern coords"]
+                 P["P: Processing element coords"]
+                 R["R: Replication coords (optional)"]
+             end
+   
+             subgraph "GPU Execution"
+                 W["Warps<br/>32 threads each"]
+                 L["Lanes<br/>Thread within warp"]
+                 REG["Registers<br/>Thread-local storage"]
+             end
+   
+             T --> TD
+             TD --> TW
+   
+             TD --> X
+             TD --> Y
+             TD --> P
+             TD --> R
+   
+             P --> W
+             P --> L
+             TW --> REG
+   
+             style TD fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
+             style P fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+             style REG fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+      
+      
+   
+   
+   

+.. image:: diagrams/tile_distribution_1.svg
+   :alt: Diagram
+   :align: center
 Coordinate System Architecture
 ------------------------------

-.. raw:: html
-
-   <div class="mermaid">
-   flowchart LR
-       subgraph "Input"
-           TC["Thread Coordinates<br/>(warpId, laneId)"]
-       end
-       
-       subgraph "Transformation Pipeline"
-           P2Y["P → Y<br/>Thread to pattern"]
-           Y2X["Y → X<br/>Pattern to physical"]
-           Y2D["Y → D<br/>Pattern to register"]
-       end
-       
-       subgraph "Output"
-           MC["Memory Coordinates<br/>Global addresses"]
-           RI["Register Indices<br/>Local storage"]
-       end
-       
-       TC --> P2Y
-       P2Y --> Y2X
-       P2Y --> Y2D
-       Y2X --> MC
-       Y2D --> RI
-       
-       style TC fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
-       style MC fill:#d1fae5,stroke:#10b981,stroke-width:2px
-       style RI fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
-   </div>
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+      .. mermaid::
+      
+         flowchart LR
+             subgraph "Input"
+                 TC["Thread Coordinates<br/>(warpId, laneId)"]
+             end
+   
+             subgraph "Transformation Pipeline"
+                 P2Y["P → Y<br/>Thread to pattern"]
+                 Y2X["Y → X<br/>Pattern to physical"]
+                 Y2D["Y → D<br/>Pattern to register"]
+             end
+   
+             subgraph "Output"
+                 MC["Memory Coordinates<br/>Global addresses"]
+                 RI["Register Indices<br/>Local storage"]
+             end
+   
+             TC --> P2Y
+             P2Y --> Y2X
+             P2Y --> Y2D
+             Y2X --> MC
+             Y2D --> RI
+   
+             style TC fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
+             style MC fill:#d1fae5,stroke:#10b981,stroke-width:2px
+             style RI fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
+      
+      
+   
+   
+   

+.. image:: diagrams/tile_distribution_2.svg
+   :alt: Diagram
+   :align: center
 What is Tile Distribution?
 --------------------------

@@ -132,41 +156,53 @@ The key insight that makes TileDistribution effective is its ability to abstract
 Problem Space Mapping
 ---------------------

-.. raw:: html
-
-   <div class="mermaid" style="margin: 0 auto; display: block; width: 60%;">
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+      .. mermaid::
+      
+   
+         graph TB
+             subgraph "Problem Space (256×256 Matrix)"
+                 M["Full Matrix<br/>65,536 elements"]
+                 T1["Tile 1<br/>32×32"]
+                 T2["Tile 2<br/>32×32"]
+                 TN["Tile N<br/>32×32"]
+             end
+   
+             subgraph "Thread Assignment"
+                 W0["Warp 0<br/>32 threads"]
+                 W1["Warp 1<br/>32 threads"]
+                 L0["Lane 0-31<br/>Individual threads"]
+             end
+   
+             subgraph "Memory Pattern"
+                 MP["Coalesced Access<br/>Sequential addresses<br/>No bank conflicts"]
+             end
+   
+             M --> T1
+             M --> T2
+             M --> TN
+   
+             T1 --> W0
+             T1 --> W1
+             W0 --> L0
+             L0 --> MP
+   
+             style M fill:#fee2e2,stroke:#ef4444,stroke-width:2px
+             style MP fill:#d1fae5,stroke:#10b981,stroke-width:2px
+      
+      
+   
+   
   
-   graph TB
-       subgraph "Problem Space (256×256 Matrix)"
-           M["Full Matrix<br/>65,536 elements"]
-           T1["Tile 1<br/>32×32"]
-           T2["Tile 2<br/>32×32"]
-           TN["Tile N<br/>32×32"]
-       end
-       
-       subgraph "Thread Assignment"
-           W0["Warp 0<br/>32 threads"]
-           W1["Warp 1<br/>32 threads"]
-           L0["Lane 0-31<br/>Individual threads"]
-       end
-       
-       subgraph "Memory Pattern"
-           MP["Coalesced Access<br/>Sequential addresses<br/>No bank conflicts"]
-       end
-       
-       M --> T1
-       M --> T2
-       M --> TN
-       
-       T1 --> W0
-       T1 --> W1
-       W0 --> L0
-       L0 --> MP
-       
-       style M fill:#fee2e2,stroke:#ef4444,stroke-width:2px
-       style MP fill:#d1fae5,stroke:#10b981,stroke-width:2px
-   </div>

+.. image:: diagrams/tile_distribution_3.svg
+   :alt: Diagram
+   :align: center
 Creating a TileDistribution
 ---------------------------

@@ -339,38 +375,50 @@ Let's see how to create and use a TileDistribution in practice:
 Hierarchical Decomposition
 --------------------------

-.. raw:: html
-
-   <div class="mermaid" style="margin: 0 auto; display: block; width: 45%;">
-   graph TB
-       subgraph "Level 1: Block Distribution"
-           B["Thread Block<br/>256 threads"]
-           BT1["Block Tile 1<br/>64×64"]
-           BT2["Block Tile 2<br/>64×64"]
-       end
-       
-       subgraph "Level 2: Warp Distribution"
-           W["Warp<br/>32 threads"]
-           WT1["Warp Tile 1<br/>16×16"]
-           WT2["Warp Tile 2<br/>16×16"]
-       end
-       
-       subgraph "Level 3: Thread Distribution"
-           T["Thread"]
-           TT["Thread Tile<br/>2×2"]
-       end
-       
-       B --> BT1
-       BT1 --> W
-       W --> WT1
-       WT1 --> T
-       T --> TT
-       
-       style B fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
-       style W fill:#fff3e0,stroke:#f57c00,stroke-width:2px
-       style T fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
-   </div>
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+      .. mermaid::
+      
+         graph TB
+             subgraph "Level 1: Block Distribution"
+                 B["Thread Block<br/>256 threads"]
+                 BT1["Block Tile 1<br/>64×64"]
+                 BT2["Block Tile 2<br/>64×64"]
+             end
+   
+             subgraph "Level 2: Warp Distribution"
+                 W["Warp<br/>32 threads"]
+                 WT1["Warp Tile 1<br/>16×16"]
+                 WT2["Warp Tile 2<br/>16×16"]
+             end
+   
+             subgraph "Level 3: Thread Distribution"
+                 T["Thread"]
+                 TT["Thread Tile<br/>2×2"]
+             end
+   
+             B --> BT1
+             BT1 --> W
+             W --> WT1
+             WT1 --> T
+             T --> TT
+   
+             style B fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+             style W fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+             style T fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+      
+      
+   
+   
+   

+.. image:: diagrams/tile_distribution_4.svg
+   :alt: Diagram
+   :align: center
 Advanced Example: Matrix Multiplication Distribution
 ----------------------------------------------------

@@ -422,36 +470,48 @@ Advanced Example: Matrix Multiplication Distribution
 Work Distribution Pattern
 -------------------------

-.. raw:: html
-
-   <div class="mermaid" style="margin: 0 auto; display: block; width: 30%;">
-   flowchart TB
-       subgraph "Matrix C (128×128)"
-           C["16,384 elements"]
-       end
-       
-       subgraph "Thread Grid (32×32)"
-           TG["1,024 threads"]
-       end
-       
-       subgraph "Per Thread"
-           PT["4×4 tile<br/>16 elements"]
-       end
-       
-       subgraph "Memory Access"
-           MA["Coalesced reads<br/>Efficient writes<br/>No conflicts"]
-       end
-       
-       C --> TG
-       TG --> PT
-       PT --> MA
-       
-       style C fill:#fee2e2,stroke:#ef4444,stroke-width:2px
-       style TG fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
-       style PT fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
-       style MA fill:#d1fae5,stroke:#10b981,stroke-width:2px
-   </div>
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+      .. mermaid::
+      
+         flowchart TB
+             subgraph "Matrix C (128×128)"
+                 C["16,384 elements"]
+             end
+   
+             subgraph "Thread Grid (32×32)"
+                 TG["1,024 threads"]
+             end
+   
+             subgraph "Per Thread"
+                 PT["4×4 tile<br/>16 elements"]
+             end
+   
+             subgraph "Memory Access"
+                 MA["Coalesced reads<br/>Efficient writes<br/>No conflicts"]
+             end
+   
+             C --> TG
+             TG --> PT
+             PT --> MA
+   
+             style C fill:#fee2e2,stroke:#ef4444,stroke-width:2px
+             style TG fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+             style PT fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+             style MA fill:#d1fae5,stroke:#10b981,stroke-width:2px
+      
+      
+   
+   
+   

+.. image:: diagrams/tile_distribution_5.svg
+   :alt: Diagram
+   :align: center
 Memory Access Patterns
 ----------------------

@@ -464,79 +524,103 @@ One of the key benefits of TileDistribution is generating optimal memory access
 Transformation Pipeline
 -----------------------

-.. raw:: html
-
-   <div class="mermaid" style="margin: 0 auto; display: block; width: 100%;">
-   graph LR
-       subgraph "Input"
-           TID["Thread ID<br/>(0-1023)"]
-       end
-       
-       subgraph "Stage 1"
-           P["P-coordinates<br/>(warp, lane)"]
-       end
-       
-       subgraph "Stage 2"
-           Y["Y-coordinates<br/>(tile position)"]
-       end
-       
-       subgraph "Stage 3"
-           X["X-coordinates<br/>(tensor indices)"]
-       end
-       
-       subgraph "Output"
-           ADDR["Memory addresses<br/>Register indices"]
-       end
-       
-       TID --> P
-       P --> Y
-       Y --> X
-       X --> ADDR
-       
-       style TID fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
-       style ADDR fill:#d1fae5,stroke:#10b981,stroke-width:2px
-   </div>
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+      .. mermaid::
+      
+         graph LR
+             subgraph "Input"
+                 TID["Thread ID<br/>(0-1023)"]
+             end
+   
+             subgraph "Stage 1"
+                 P["P-coordinates<br/>(warp, lane)"]
+             end
+   
+             subgraph "Stage 2"
+                 Y["Y-coordinates<br/>(tile position)"]
+             end
+   
+             subgraph "Stage 3"
+                 X["X-coordinates<br/>(tensor indices)"]
+             end
+   
+             subgraph "Output"
+                 ADDR["Memory addresses<br/>Register indices"]
+             end
+   
+             TID --> P
+             P --> Y
+             Y --> X
+             X --> ADDR
+   
+             style TID fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
+             style ADDR fill:#d1fae5,stroke:#10b981,stroke-width:2px
+      
+      
+   
+   
+   

+.. image:: diagrams/tile_distribution_6.svg
+   :alt: Diagram
+   :align: center
 Performance Comparison
 ----------------------

-.. raw:: html
-
-   <div class="mermaid">
-   graph TB
-       subgraph "Manual Implementation"
-           M1["Calculate indices manually"]
-           M2["Handle boundary conditions"]
-           M3["Ensure coalescing"]
-           M4["Manage bank conflicts"]
-           M5["~200 lines of code"]
-       end
-       
-       subgraph "With TileDistribution"
-           T1["make_tile_distribution()"]
-           T2["Automatic optimization"]
-           T3["~10 lines of code"]
-       end
-       
-       subgraph "Performance"
-           P1["Same performance"]
-           P2["Fewer bugs"]
-           P3["Portable across GPUs"]
-       end
-       
-       M1 --> M5
-       T1 --> T3
-       
-       M5 --> P1
-       T3 --> P1
-       P1 --> P2
-       P2 --> P3
-       
-       style M5 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
-       style T3 fill:#d1fae5,stroke:#10b981,stroke-width:2px
-       style P3 fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
-   </div>
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
+   
+      .. mermaid::
+      
+         graph TB
+             subgraph "Manual Implementation"
+                 M1["Calculate indices manually"]
+                 M2["Handle boundary conditions"]
+                 M3["Ensure coalescing"]
+                 M4["Manage bank conflicts"]
+                 M5["~200 lines of code"]
+             end
+   
+             subgraph "With TileDistribution"
+                 T1["make_tile_distribution()"]
+                 T2["Automatic optimization"]
+                 T3["~10 lines of code"]
+             end
+   
+             subgraph "Performance"
+                 P1["Same performance"]
+                 P2["Fewer bugs"]
+                 P3["Portable across GPUs"]
+             end
+   
+             M1 --> M5
+             T1 --> T3
+   
+             M5 --> P1
+             T3 --> P1
+             P1 --> P2
+             P2 --> P3
+   
+             style M5 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
+             style T3 fill:#d1fae5,stroke:#10b981,stroke-width:2px
+             style P3 fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
+      
+      
+   
+   
+   

+.. image:: diagrams/tile_distribution_7.svg
+   :alt: Diagram
+   :align: center
 Summary
 -------

--- a/docs/conceptual/ck_tile/transforms.rst
+++ b/docs/conceptual/ck_tile/transforms.rst
@@ -32,28 +32,36 @@ Zero-Copy Logical Operations
 .. 
   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-   .. mermaid::
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-      graph TB
-          subgraph "Tensor Coordinate Transformation"
-              US["Lower Dimension Space<br/>Source coordinate system"]
-              LS["Upper Dimension Space<br/>Target coordinate system"]
-              
-              DATA["Linear Data in Memory<br/>Layout determined by tensor<br/>shape & strides"]
-          end
-          
-          US -->|"Forward Transform"| LS
-          LS -->|"Inverse Transform"| US
-          
-          DATA -.->|"Same data,<br/>different views"| US
-          DATA -.->|"Same data,<br/>different views"| LS
-          
-          style US fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
-          style LS fill:#fff3e0,stroke:#f57c00,stroke-width:3px
-          style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+      .. mermaid::
+      
+         graph TB
+             subgraph "Tensor Coordinate Transformation"
+                 US["Lower Dimension Space<br/>Source coordinate system"]
+                 LS["Upper Dimension Space<br/>Target coordinate system"]
+                 
+                 DATA["Linear Data in Memory<br/>Layout determined by tensor<br/>shape & strides"]
+             end
+             
+             US -->|"Forward Transform"| LS
+             LS -->|"Inverse Transform"| US
+             
+             DATA -.->|"Same data,<br/>different views"| US
+             DATA -.->|"Same data,<br/>different views"| LS
+             
+             style US fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
+             style LS fill:#fff3e0,stroke:#f57c00,stroke-width:3px
+             style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+      
+      
   
   

+.. image:: diagrams/transforms_1.svg
+   :alt: Diagram
+   :align: center
 .. image:: diagrams/transforms_1.svg
   :alt: Diagram
   :align: center
@@ -74,38 +82,46 @@ Transform System Architecture
 .. 
   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-   .. mermaid::
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-      graph TB
-          
-          subgraph "Transform Types"
-              EMB["EmbedTransform<br/>Linear → Multi-D Strided"]
-              UNM["MergeTransform<br/>Multi-D → Linear"]
-              MRG["UnmergeTransform<br/>Linear → Multi-D"]
-              REP["ReplicateTransform<br/>0D → Multi-D Broadcast"]
-              OFF["OffsetTransform<br/>Translation"]
-              PAS["PassThroughTransform<br/>Identity"]
-              PAD["PadTransform<br/>Boundaries"]
-          end
-          
-          subgraph "Operations"
-              FWD["Forward<br/>calculate_lower_index()"]
-              BWD["Backward<br/>calculate_upper_index()"]
-              UPD["Update<br/>update_lower_index()"]
-          end
-          
-          EMB --> FWD
-          UNM --> FWD
-          MRG --> FWD
-          REP --> FWD
-          OFF --> FWD
-          PAS --> FWD
-          PAD --> FWD
-          
-          style FWD fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+      .. mermaid::
+      
+         graph TB
+             
+             subgraph "Transform Types"
+                 EMB["EmbedTransform<br/>Linear → Multi-D Strided"]
+                 UNM["MergeTransform<br/>Multi-D → Linear"]
+                 MRG["UnmergeTransform<br/>Linear → Multi-D"]
+                 REP["ReplicateTransform<br/>0D → Multi-D Broadcast"]
+                 OFF["OffsetTransform<br/>Translation"]
+                 PAS["PassThroughTransform<br/>Identity"]
+                 PAD["PadTransform<br/>Boundaries"]
+             end
+             
+             subgraph "Operations"
+                 FWD["Forward<br/>calculate_lower_index()"]
+                 BWD["Backward<br/>calculate_upper_index()"]
+                 UPD["Update<br/>update_lower_index()"]
+             end
+             
+             EMB --> FWD
+             UNM --> FWD
+             MRG --> FWD
+             REP --> FWD
+             OFF --> FWD
+             PAS --> FWD
+             PAD --> FWD
+             
+             style FWD fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+      
+      
   
   

+.. image:: diagrams/transforms_2.svg
+   :alt: Diagram
+   :align: center
 .. image:: diagrams/transforms_2.svg
   :alt: Diagram
   :align: center
@@ -117,28 +133,36 @@ MergeTransform collapses multiple dimensions from the lower coordinate space int
 .. 
   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-   .. mermaid::
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-      graph TB
-          subgraph "MergeTransform: Multi-D → Linear"
-              LS["Lower Coordinate Space<br/>2D: [4, 5]<br/>Coord: (2, 3)"]
-              US["Upper Coordinate Space<br/>1D Linear<br/>Index: 13"]
-              
-              DATA["Same Tensor Data<br/>Layout: row-major<br/>Size: 20 elements"]
-          end
-          
-          LS -->|"Forward Transform<br/>2×5 + 3 = 13"| US
-          US -->|"Inverse Transform<br/>13÷5=2, 13%5=3"| LS
-          
-          DATA -.->|"Multi-dimensional<br/>view"| LS
-          DATA -.->|"Linear<br/>view"| US
-          
-          style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
-          style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
-          style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+      .. mermaid::
+      
+         graph TB
+             subgraph "MergeTransform: Multi-D → Linear"
+                 LS["Lower Coordinate Space<br/>2D: [4, 5]<br/>Coord: (2, 3)"]
+                 US["Upper Coordinate Space<br/>1D Linear<br/>Index: 13"]
+                 
+                 DATA["Same Tensor Data<br/>Layout: row-major<br/>Size: 20 elements"]
+             end
+             
+             LS -->|"Forward Transform<br/>2×5 + 3 = 13"| US
+             US -->|"Inverse Transform<br/>13÷5=2, 13%5=3"| LS
+             
+             DATA -.->|"Multi-dimensional<br/>view"| LS
+             DATA -.->|"Linear<br/>view"| US
+             
+             style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
+             style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
+             style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+      
+      
   
   

+.. image:: diagrams/transforms_3.svg
+   :alt: Diagram
+   :align: center
 .. image:: diagrams/transforms_3.svg
   :alt: Diagram
   :align: center
@@ -180,28 +204,36 @@ UnmergeTransform expands coordinates from a single dimension in the lower coordi
 .. 
   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-   .. mermaid::
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-      graph TB
-          subgraph "UnmergeTransform: Linear → Multi-D"
-              LS["Lower Coordinate Space<br/>1D Linear<br/>Index: 14"]
-              US["Upper Coordinate Space<br/>3D: [3, 4, 2]<br/>Coord: (1, 3, 0)"]
-              
-              DATA["Same Tensor Data<br/>Layout: row-major<br/>Size: 24 elements"]
-          end
-          
-          LS -->|"Forward Transform<br/>14 = 1×8 + 3×2 + 0"| US
-          US -->|"Inverse Transform<br/>linearize back"| LS
-          
-          DATA -.->|"Linear<br/>view"| LS
-          DATA -.->|"Multi-dimensional<br/>view"| US
-          
-          style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
-          style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
-          style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+      .. mermaid::
+      
+         graph TB
+             subgraph "UnmergeTransform: Linear → Multi-D"
+                 LS["Lower Coordinate Space<br/>1D Linear<br/>Index: 14"]
+                 US["Upper Coordinate Space<br/>3D: [3, 4, 2]<br/>Coord: (1, 3, 0)"]
+                 
+                 DATA["Same Tensor Data<br/>Layout: row-major<br/>Size: 24 elements"]
+             end
+             
+             LS -->|"Forward Transform<br/>14 = 1×8 + 3×2 + 0"| US
+             US -->|"Inverse Transform<br/>linearize back"| LS
+             
+             DATA -.->|"Linear<br/>view"| LS
+             DATA -.->|"Multi-dimensional<br/>view"| US
+             
+             style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
+             style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
+             style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+      
+      
   
   

+.. image:: diagrams/transforms_4.svg
+   :alt: Diagram
+   :align: center
 .. image:: diagrams/transforms_4.svg
   :alt: Diagram
   :align: center
@@ -253,28 +285,36 @@ EmbedTransform expands linear indices from the lower coordinate space into multi
 .. 
   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-   .. mermaid::
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-      graph TB
-          subgraph "EmbedTransform: Linear → Multi-D Strided"
-              LS["Lower Coordinate Space<br/>1D Linear<br/>Index: 14"]
-              US["Upper Coordinate Space<br/>2D: [2, 3]<br/>Coord: (1, 2)"]
-              
-              DATA["Linear Buffer in Memory"]
-          end
-          
-          LS -->|"Forward Transform <br/>Strides: [12, 1] <br/>14 ÷ 12 = 1, 14 % 12 = 2"| US
-          US -->|"Inverse Transform<br/>1×12 + 2×1 = 14"| LS
-          
-          DATA -.->|"Linear<br/>index view"| LS
-          DATA -.->|"Multi-dimensional<br/>strided view"| US
-          
-          style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
-          style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
-          style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+      .. mermaid::
+      
+         graph TB
+             subgraph "EmbedTransform: Linear → Multi-D Strided"
+                 LS["Lower Coordinate Space<br/>1D Linear<br/>Index: 14"]
+                 US["Upper Coordinate Space<br/>2D: [2, 3]<br/>Coord: (1, 2)"]
+                 
+                 DATA["Linear Buffer in Memory"]
+             end
+             
+             LS -->|"Forward Transform <br/>Strides: [12, 1] <br/>14 ÷ 12 = 1, 14 % 12 = 2"| US
+             US -->|"Inverse Transform<br/>1×12 + 2×1 = 14"| LS
+             
+             DATA -.->|"Linear<br/>index view"| LS
+             DATA -.->|"Multi-dimensional<br/>strided view"| US
+             
+             style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
+             style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
+             style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+      
+      
   
   

+.. image:: diagrams/transforms_5.svg
+   :alt: Diagram
+   :align: center
 .. image:: diagrams/transforms_5.svg
   :alt: Diagram
   :align: center
@@ -315,28 +355,36 @@ ReplicateTransform creates a higher-dimensional tensor by replicating (broadcast
 .. 
   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-   .. mermaid::
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-      graph TB
-          subgraph "ReplicateTransform: 0D → Multi-D Broadcasting"
-              LS["Lower Coordinate Space<br/>0D: Scalar<br/>Empty coordinate []"]
-              US["Upper Coordinate Space<br/>2D: [3, 4]<br/>All coords: (i, j)"]
-              
-              DATA["Single Scalar Value"]
-          end
-          
-          LS -->|"Forward Transform<br/>[] → (i,j) for any i,j"| US
-          US -->|"Inverse Transform<br/>(i,j) → [] for any i,j"| LS
-          
-          DATA -.->|"One scalar<br/>value"| LS
-          DATA -.->|"Broadcasted view<br/>at all positions"| US
-          
-          style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
-          style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
-          style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+      .. mermaid::
+      
+         graph TB
+             subgraph "ReplicateTransform: 0D → Multi-D Broadcasting"
+                 LS["Lower Coordinate Space<br/>0D: Scalar<br/>Empty coordinate []"]
+                 US["Upper Coordinate Space<br/>2D: [3, 4]<br/>All coords: (i, j)"]
+                 
+                 DATA["Single Scalar Value"]
+             end
+             
+             LS -->|"Forward Transform<br/>[] → (i,j) for any i,j"| US
+             US -->|"Inverse Transform<br/>(i,j) → [] for any i,j"| LS
+             
+             DATA -.->|"One scalar<br/>value"| LS
+             DATA -.->|"Broadcasted view<br/>at all positions"| US
+             
+             style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
+             style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
+             style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+      
+      
   
   

+.. image:: diagrams/transforms_6.svg
+   :alt: Diagram
+   :align: center
 .. image:: diagrams/transforms_6.svg
   :alt: Diagram
   :align: center
@@ -388,28 +436,36 @@ OffsetTransform shifts coordinates by a fixed offset, creating a translated view
 .. 
   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-   .. mermaid::
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-      graph TB
-          subgraph "OffsetTransform: 1D → 1D Translation"
-              LS["Lower Coordinate Space<br/>1D: [0, 63]<br/>Coord: index + offset"]
-              US["Upper Coordinate Space<br/>1D: [0, 47]<br/>Coord: index"]
-              
-              DATA["Linear Buffer in Memory"]
-          end
-          
-          LS -->|"Forward Transform<br/>idx → idx + 16"| US
-          US -->|"Inverse Transform<br/>idx + 16 → idx"| LS
-          
-          DATA -.->|"Lower<br/>view"| LS
-          DATA -.->|"Upper<br/>view"| US
-          
-          style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
-          style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
-          style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+      .. mermaid::
+      
+         graph TB
+             subgraph "OffsetTransform: 1D → 1D Translation"
+                 LS["Lower Coordinate Space<br/>1D: [0, 63]<br/>Coord: index + offset"]
+                 US["Upper Coordinate Space<br/>1D: [0, 47]<br/>Coord: index"]
+                 
+                 DATA["Linear Buffer in Memory"]
+             end
+             
+             LS -->|"Forward Transform<br/>idx → idx + 16"| US
+             US -->|"Inverse Transform<br/>idx + 16 → idx"| LS
+             
+             DATA -.->|"Lower<br/>view"| LS
+             DATA -.->|"Upper<br/>view"| US
+             
+             style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
+             style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
+             style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+      
+      
   
   

+.. image:: diagrams/transforms_7.svg
+   :alt: Diagram
+   :align: center
 .. image:: diagrams/transforms_7.svg
   :alt: Diagram
   :align: center
@@ -462,28 +518,36 @@ No-op transform that passes coordinates unchanged. The PassThrough transform is
 .. 
   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-   .. mermaid::
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-      graph TB
-          subgraph "PassThroughTransform: 1D → 1D Identity"
-              LS["Lower Coordinate Space<br/>1D: [0, 59]<br/>Coord: index"]
-              US["Upper Coordinate Space<br/>1D: [0, 59]<br/>Coord: index"]
-              
-              DATA["Linear Buffer in Memory"]
-          end
-          
-          LS -.->|"Perfect Identity<br/>idx → idx"| US
-          US -.->|"Perfect Identity<br/>idx → idx"| LS
-          
-          DATA -->|"Same buffer<br/>same view"| LS
-          DATA -->|"Same buffer<br/>same view"| US
-          
-          style LS fill:#e8f5e8,stroke:#2e7d32,stroke-width:3px
-          style US fill:#e8f5e8,stroke:#2e7d32,stroke-width:3px
-          style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+      .. mermaid::
+      
+         graph TB
+             subgraph "PassThroughTransform: 1D → 1D Identity"
+                 LS["Lower Coordinate Space<br/>1D: [0, 59]<br/>Coord: index"]
+                 US["Upper Coordinate Space<br/>1D: [0, 59]<br/>Coord: index"]
+                 
+                 DATA["Linear Buffer in Memory"]
+             end
+             
+             LS -.->|"Perfect Identity<br/>idx → idx"| US
+             US -.->|"Perfect Identity<br/>idx → idx"| LS
+             
+             DATA -->|"Same buffer<br/>same view"| LS
+             DATA -->|"Same buffer<br/>same view"| US
+             
+             style LS fill:#e8f5e8,stroke:#2e7d32,stroke-width:3px
+             style US fill:#e8f5e8,stroke:#2e7d32,stroke-width:3px
+             style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+      
+      
   
   

+.. image:: diagrams/transforms_8.svg
+   :alt: Diagram
+   :align: center
 .. image:: diagrams/transforms_8.svg
   :alt: Diagram
   :align: center
@@ -531,28 +595,36 @@ PadTransform adds padding to tensor dimensions, mapping coordinates from upper d
 .. 
   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-   .. mermaid::
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-      graph TB
-          subgraph "PadTransform: 1D → 1D with Padding"
-              LS["Lower Coordinate Space<br/>1D: [0, 2] (original data)"]
-              US["Upper Coordinate Space<br/>1D: [0, 4] (with padding)"]
-              
-              DATA["Tensor Data in Memory"]
-          end
-          
-          LS -->|"Forward Transform<br/>idx + left_pad"| US
-          US -->|"Inverse Transform<br/>idx - left_pad"| LS
-          
-          DATA -.->|"Original view"| LS
-          DATA -.->|"Padded view"| US
-          
-          style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
-          style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
-          style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+      .. mermaid::
+      
+         graph TB
+             subgraph "PadTransform: 1D → 1D with Padding"
+                 LS["Lower Coordinate Space<br/>1D: [0, 2] (original data)"]
+                 US["Upper Coordinate Space<br/>1D: [0, 4] (with padding)"]
+                 
+                 DATA["Tensor Data in Memory"]
+             end
+             
+             LS -->|"Forward Transform<br/>idx + left_pad"| US
+             US -->|"Inverse Transform<br/>idx - left_pad"| LS
+             
+             DATA -.->|"Original view"| LS
+             DATA -.->|"Padded view"| US
+             
+             style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
+             style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
+             style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+      
+      
   
   

+.. image:: diagrams/transforms_9.svg
+   :alt: Diagram
+   :align: center
 .. image:: diagrams/transforms_9.svg
   :alt: Diagram
   :align: center
@@ -603,28 +675,36 @@ XorTransform applies a 2D XOR mapping for specialized memory access patterns. It
 .. 
   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-   .. mermaid::
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-      graph TB
-          subgraph "XorTransform: 2D → 2D XOR Mapping"
-              LS["Lower Coordinate Space<br/>2D: [4, 8]<br/>XOR-transformed coords"]
-              US["Upper Coordinate Space<br/>2D: [4, 8]<br/>Normal coords"]
-              
-              DATA["Same Tensor Data"]
-          end
-          
-          LS -->|"Forward Transform<br/>apply XOR reverse"| US
-          US -->|"Inverse Transform<br/>apply XOR mapping"| LS
-          
-          DATA -.->|"XOR pattern<br/>view"| LS
-          DATA -.->|"Normal<br/>view"| US
-          
-          style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
-          style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
-          style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+      .. mermaid::
+      
+         graph TB
+             subgraph "XorTransform: 2D → 2D XOR Mapping"
+                 LS["Lower Coordinate Space<br/>2D: [4, 8]<br/>XOR-transformed coords"]
+                 US["Upper Coordinate Space<br/>2D: [4, 8]<br/>Normal coords"]
+                 
+                 DATA["Same Tensor Data"]
+             end
+             
+             LS -->|"Forward Transform<br/>apply XOR reverse"| US
+             US -->|"Inverse Transform<br/>apply XOR mapping"| LS
+             
+             DATA -.->|"XOR pattern<br/>view"| LS
+             DATA -.->|"Normal<br/>view"| US
+             
+             style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
+             style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
+             style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+      
+      
   
   

+.. image:: diagrams/transforms_10.svg
+   :alt: Diagram
+   :align: center
 .. image:: diagrams/transforms_10.svg
   :alt: Diagram
   :align: center
@@ -636,28 +716,36 @@ SliceTransform extracts a sub-region from a tensor dimension.
 .. 
   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-   .. mermaid::
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-      graph TB
-          subgraph "SliceTransform: 1D → 1D Sub-region"
-              LS["Lower Coordinate Space<br/>1D: [0, 9] (original range)"]
-              US["Upper Coordinate Space<br/>1D: [0, 4] (slice range)"]
-              
-              DATA["Tensor Data in Memory"]
-          end
-          
-          LS -->|"Forward Transform<br/>idx + slice_begin"| US
-          US -->|"Inverse Transform<br/>idx - slice_begin"| LS
-          
-          DATA -.->|"Full tensor<br/>view"| LS
-          DATA -.->|"Sub-region<br/>view"| US
-          
-          style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
-          style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
-          style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+      .. mermaid::
+      
+         graph TB
+             subgraph "SliceTransform: 1D → 1D Sub-region"
+                 LS["Lower Coordinate Space<br/>1D: [0, 9] (original range)"]
+                 US["Upper Coordinate Space<br/>1D: [0, 4] (slice range)"]
+                 
+                 DATA["Tensor Data in Memory"]
+             end
+             
+             LS -->|"Forward Transform<br/>idx + slice_begin"| US
+             US -->|"Inverse Transform<br/>idx - slice_begin"| LS
+             
+             DATA -.->|"Full tensor<br/>view"| LS
+             DATA -.->|"Sub-region<br/>view"| US
+             
+             style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
+             style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
+             style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+      
+      
   
   

+.. image:: diagrams/transforms_11.svg
+   :alt: Diagram
+   :align: center
 .. image:: diagrams/transforms_11.svg
   :alt: Diagram
   :align: center
@@ -669,28 +757,36 @@ ModuloTransform applies cyclic wrapping to coordinates using modulo operations.
 .. 
   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-   .. mermaid::
+.. 
+   Original mermaid diagram (edit here, then run update_diagrams.py)
   
-      graph TB
-          subgraph "ModuloTransform: 1D → 1D Cyclic"
-              LS["Lower Coordinate Space<br/>1D: [0, 3] (modulus range)"]
-              US["Upper Coordinate Space<br/>1D: [0, 15] (full range)"]
-              
-              DATA["Tensor Data in Memory"]
-          end
-          
-          LS -->|"Forward Transform<br/>idx * cycle_count"| US
-          US -->|"Inverse Transform<br/>idx % modulus"| LS
-          
-          DATA -.->|" "| LS
-          DATA -.->|" "| US
-          
-          style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
-          style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
-          style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+      .. mermaid::
+      
+         graph TB
+             subgraph "ModuloTransform: 1D → 1D Cyclic"
+                 LS["Lower Coordinate Space<br/>1D: [0, 3] (modulus range)"]
+                 US["Upper Coordinate Space<br/>1D: [0, 15] (full range)"]
+                 
+                 DATA["Tensor Data in Memory"]
+             end
+             
+             LS -->|"Forward Transform<br/>idx * cycle_count"| US
+             US -->|"Inverse Transform<br/>idx % modulus"| LS
+             
+             DATA -.->|" "| LS
+             DATA -.->|" "| US
+             
+             style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
+             style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
+             style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+      
+      
   
   

+.. image:: diagrams/transforms_12.svg
+   :alt: Diagram
+   :align: center
 .. image:: diagrams/transforms_12.svg
   :alt: Diagram
   :align: center