+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
+
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
+
+ .. mermaid::
+
+ graph TB
+ subgraph "Non-Atomic Operation (Race Condition)"
+ NA1["Thread 1: Read value (10)"] --> NA2["Thread 1: Add 5 (15)"]
+ NA3["Thread 2: Read value (10)"] --> NA4["Thread 2: Add 3 (13)"]
+ NA2 --> NA5["Thread 1: Write 15"]
+ NA4 --> NA6["Thread 2: Write 13"]
+ NA5 & NA6 --> NA7["Final value: 13 ❌ (Lost update from Thread 1)"]
+ end
+
+ subgraph "Atomic Operation (Thread-Safe)"
+ A1["Thread 1: atomic_add(5)"] --> A2["Hardware ensures serialization"]
+ A3["Thread 2: atomic_add(3)"] --> A2
+ A2 --> A4["Final value: 18 ✓ (Both updates applied)"]
+ end
+
+ style NA7 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
+ style A4 fill:#d1fae5,stroke:#10b981,stroke-width:2px
+
+
+
+
+
+.. image:: diagrams/buffer_views_4.svg
+ :alt: Diagram
+ :align: center
C++ Atomic Operations
~~~~~~~~~~~~~~~~~~~~~
diff --git a/docs/conceptual/ck_tile/convert_mermaid_to_svg.py b/docs/conceptual/ck_tile/convert_mermaid_to_svg.py
index 3a09f54097..a6cdadd4e4 100644
--- a/docs/conceptual/ck_tile/convert_mermaid_to_svg.py
+++ b/docs/conceptual/ck_tile/convert_mermaid_to_svg.py
@@ -31,11 +31,16 @@ RST_FILES = [
'descriptors.rst',
'coordinate_movement.rst',
'adaptors.rst',
+ 'introduction_motivation.rst',
+ 'buffer_views.rst',
+ 'tensor_views.rst',
+ 'coordinate_systems.rst',
+ 'tile_distribution.rst',
]
-# Pattern to find mermaid blocks
+# Pattern to find mermaid blocks (can be indented with 3 spaces for commented blocks)
MERMAID_PATTERN = re.compile(
- r'^\.\. mermaid::\s*\n((?:(?:\n| .*))*)',
+ r'^(?: )?\.\. mermaid::\s*\n((?:(?:\n| .*))*)',
re.MULTILINE
)
diff --git a/docs/conceptual/ck_tile/convert_raw_html_to_commented.py b/docs/conceptual/ck_tile/convert_raw_html_to_commented.py
new file mode 100644
index 0000000000..3cba55926d
--- /dev/null
+++ b/docs/conceptual/ck_tile/convert_raw_html_to_commented.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python3
+"""Convert raw HTML mermaid blocks to commented format for SVG conversion."""
+
+import os
+import re
+
+def convert_raw_html_to_commented(content):
+ """Convert raw HTML mermaid blocks to commented mermaid format."""
+
+ # Pattern to match raw HTML mermaid blocks
+ pattern = r'\.\. raw:: html\n\n
]*>\n(.*?)\n
'
+
+ def replace_block(match):
+ mermaid_code = match.group(1)
+ # The mermaid code in HTML has 3-space indentation, keep it
+ # but add 3 more spaces for .. mermaid:: indentation
+ mermaid_lines = mermaid_code.split('\n')
+ properly_indented = []
+ for line in mermaid_lines:
+ if line.strip(): # Non-empty line
+ # Line already has 3 spaces from HTML, add 3 more for mermaid block
+ properly_indented.append(' ' + line)
+ else:
+ properly_indented.append('')
+
+ indented_code = '\n'.join(properly_indented)
+
+ # Create commented format matching the expected pattern
+ commented = f"""..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
+
+ .. mermaid::
+
+{indented_code}
+
+
+"""
+ return commented
+
+ return re.sub(pattern, replace_block, content, flags=re.DOTALL)
+
+def main():
+ """Process files with raw HTML mermaid blocks."""
+
+ files_to_convert = [
+ 'introduction_motivation.rst',
+ 'buffer_views.rst',
+ 'tensor_views.rst',
+ 'coordinate_systems.rst',
+ 'tile_distribution.rst'
+ ]
+
+ converted_files = []
+
+ for filename in files_to_convert:
+ if not os.path.exists(filename):
+ print(f'Skipping {filename} - not found')
+ continue
+
+ with open(filename, 'r', encoding='utf-8') as f:
+ original = f.read()
+
+ converted = convert_raw_html_to_commented(original)
+
+ if converted != original:
+ with open(filename, 'w', encoding='utf-8') as f:
+ f.write(converted)
+
+ blocks_converted = original.count('.. raw:: html')
+ converted_files.append((filename, blocks_converted))
+ print(f'✓ Converted {filename}: {blocks_converted} blocks')
+ else:
+ print(f' {filename}: no raw HTML blocks found')
+
+ print(f'\n=== CONVERSION COMPLETE ===')
+ print(f'Files converted: {len(converted_files)}')
+ print(f'Total blocks: {sum(c for _, c in converted_files)}')
+ print('\nNext: Run convert_mermaid_to_svg.py to generate SVG files')
+
+if __name__ == '__main__':
+ main()
diff --git a/docs/conceptual/ck_tile/convolution_example.rst b/docs/conceptual/ck_tile/convolution_example.rst
index 200efcd74e..15c6bce874 100644
--- a/docs/conceptual/ck_tile/convolution_example.rst
+++ b/docs/conceptual/ck_tile/convolution_example.rst
@@ -18,42 +18,50 @@ The key insight is that convolution can be transformed from a complex nested loo
..
Original mermaid diagram (edit here, then run update_diagrams.py)
- .. mermaid::
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
- graph TB
- subgraph "Convolution Process"
- I["Input Image 6×6"]
- K["Kernel 3×3"]
- SW["Sliding Window Extract 3×3 patches"]
- DP["Dot Product Element-wise multiply & sum"]
- O["Output 4×4"]
- end
-
- subgraph "Im2col Optimization"
- W["Windows Matrix 16×9 (all patches)"]
- KF["Kernel Flattened 9×1"]
- MM["Matrix Multiply W @ K"]
- OF["Output Flattened 16×1"]
- end
-
- I --> SW
- K --> DP
- SW --> DP
- DP --> O
-
- SW --> W
- K --> KF
- W --> MM
- KF --> MM
- MM --> OF
- OF --> O
-
- style I fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
- style O fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
- style MM fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+ .. mermaid::
+
+ graph TB
+ subgraph "Convolution Process"
+ I["Input Image 6×6"]
+ K["Kernel 3×3"]
+ SW["Sliding Window Extract 3×3 patches"]
+ DP["Dot Product Element-wise multiply & sum"]
+ O["Output 4×4"]
+ end
+
+ subgraph "Im2col Optimization"
+ W["Windows Matrix 16×9 (all patches)"]
+ KF["Kernel Flattened 9×1"]
+ MM["Matrix Multiply W @ K"]
+ OF["Output Flattened 16×1"]
+ end
+
+ I --> SW
+ K --> DP
+ SW --> DP
+ DP --> O
+
+ SW --> W
+ K --> KF
+ W --> MM
+ KF --> MM
+ MM --> OF
+ OF --> O
+
+ style I fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+ style O fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+ style MM fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+
+
+.. image:: diagrams/convolution_example.svg
+ :alt: Diagram
+ :align: center
.. image:: diagrams/convolution_example.svg
:alt: Diagram
:align: center
diff --git a/docs/conceptual/ck_tile/coordinate_movement.rst b/docs/conceptual/ck_tile/coordinate_movement.rst
index 860d585a21..6915842139 100644
--- a/docs/conceptual/ck_tile/coordinate_movement.rst
+++ b/docs/conceptual/ck_tile/coordinate_movement.rst
@@ -20,36 +20,44 @@ For the mathematical foundations of coordinate systems, see :ref:`ck_tile_coordi
..
Original mermaid diagram (edit here, then run update_diagrams.py)
- .. mermaid::
-
- graph TB
- subgraph "Coordinate Movement System"
- TC["TensorCoordinate Position + Descriptor Context"]
- TAC["TensorAdaptorCoordinate Position + Transform Context"]
- MC["move_coordinate() Efficient Navigation"]
- end
-
- subgraph "Movement Example"
- S["Start: [1,1] Offset: 5"]
- M1["Move [0,1] → [1,2] Offset: 6"]
- M2["Move [1,0] → [2,2] Offset: 10"]
- M3["Move [1,1] → [3,3] Offset: 15"]
- end
-
- TC --> MC
- TAC --> MC
-
- S --> M1
- M1 --> M2
- M2 --> M3
-
- style TC fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
- style TAC fill:#fff3e0,stroke:#f57c00,stroke-width:2px
- style MC fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
+ .. mermaid::
+
+ graph TB
+ subgraph "Coordinate Movement System"
+ TC["TensorCoordinate Position + Descriptor Context"]
+ TAC["TensorAdaptorCoordinate Position + Transform Context"]
+ MC["move_coordinate() Efficient Navigation"]
+ end
+
+ subgraph "Movement Example"
+ S["Start: [1,1] Offset: 5"]
+ M1["Move [0,1] → [1,2] Offset: 6"]
+ M2["Move [1,0] → [2,2] Offset: 10"]
+ M3["Move [1,1] → [3,3] Offset: 15"]
+ end
+
+ TC --> MC
+ TAC --> MC
+
+ S --> M1
+ M1 --> M2
+ M2 --> M3
+
+ style TC fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+ style TAC fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+ style MC fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+
+
+
+.. image:: diagrams/coordinate_movement.svg
+ :alt: Diagram
+ :align: center
.. image:: diagrams/coordinate_movement.svg
:alt: Diagram
:align: center
diff --git a/docs/conceptual/ck_tile/coordinate_systems.rst b/docs/conceptual/ck_tile/coordinate_systems.rst
index 551f766321..0763d83c46 100644
--- a/docs/conceptual/ck_tile/coordinate_systems.rst
+++ b/docs/conceptual/ck_tile/coordinate_systems.rst
@@ -15,39 +15,51 @@ The Five Coordinate Spaces
The CK framework employs five interconnected coordinate spaces, each serving a specific purpose in the journey from thread identification to memory access. These spaces work together to solve the fundamental challenge of GPU programming: efficiently distributing work across thousands of parallel threads while maintaining optimal memory access patterns.
-.. raw:: html
-
-
- graph TB
- subgraph "Coordinate Spaces Overview"
- P["P-space Thread Identification Which thread am I?"]
- Y["Y-space Logical Tile Which element in my tile?"]
- X["X-space Physical Tensor Where in the tensor?"]
- R["R-space Replication Data sharing pattern"]
- D["D-space Linear Storage Memory address"]
- end
-
- subgraph "Transformations"
- T1["P + Y → X Thread + Element → Position"]
- T2["X → D Position → Address"]
- end
-
- P --> T1
- Y --> T1
- T1 --> X
- X --> T2
- T2 --> D
-
- R -.-> P
- R -.-> Y
-
- style P fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
- style Y fill:#fff3e0,stroke:#f57c00,stroke-width:2px
- style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
- style R fill:#fce4ec,stroke:#c2185b,stroke-width:2px
- style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
-
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
+
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
+
+ .. mermaid::
+
+ graph TB
+ subgraph "Coordinate Spaces Overview"
+ P["P-space Thread Identification Which thread am I?"]
+ Y["Y-space Logical Tile Which element in my tile?"]
+ X["X-space Physical Tensor Where in the tensor?"]
+ R["R-space Replication Data sharing pattern"]
+ D["D-space Linear Storage Memory address"]
+ end
+
+ subgraph "Transformations"
+ T1["P + Y → X Thread + Element → Position"]
+ T2["X → D Position → Address"]
+ end
+
+ P --> T1
+ Y --> T1
+ T1 --> X
+ X --> T2
+ T2 --> D
+
+ R -.-> P
+ R -.-> Y
+
+ style P fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+ style Y fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+ style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+ style R fill:#fce4ec,stroke:#c2185b,stroke-width:2px
+ style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
+
+
+
+
+
+.. image:: diagrams/coordinate_systems_1.svg
+ :alt: Diagram
+ :align: center
The Challenge and Solution
~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -69,41 +81,53 @@ P-space (Partition Space) represents the foundation of the coordinate system hie
GPU Thread Hierarchy
~~~~~~~~~~~~~~~~~~~~
-.. raw:: html
-
-
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
+
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
+
+ .. mermaid::
+
+ graph TB
+ subgraph "GPU Thread Hierarchy"
+ subgraph "Block"
+ subgraph "Warp 0"
+ T0["Thread 0 P=[0,0]"]
+ T1["Thread 1 P=[0,1]"]
+ T2["Thread 2 P=[0,2]"]
+ T31["..."]
+ T3["Thread 31 P=[0,31]"]
+ end
+ subgraph "Warp 1"
+ T32["Thread 32 P=[1,0]"]
+ T33["Thread 33 P=[1,1]"]
+ T34["..."]
+ T63["Thread 63 P=[1,31]"]
+ end
+ W2["Warp 2..."]
+ W7["Warp 7"]
+ end
+ end
+
+ subgraph "P-space Mapping"
+ PM["P-coordinates = [warp_id, lane_id] or P-coordinates = [block_x, block_y, thread_x, thread_y]"]
+ end
+
+ T0 --> PM
+ T32 --> PM
+
+ style T0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+ style T32 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+
+
+
+
+
+.. image:: diagrams/coordinate_systems_2.svg
+ :alt: Diagram
+ :align: center
The structure of P-space directly reflects the :ref:`hardware organization ` of modern GPUs. Each thread receives a unique P-coordinate that encodes its position within the execution hierarchy. For simple distributions, P-space might be one-dimensional, containing only a thread ID. For complex hierarchical distributions, P-space can have multiple dimensions representing different levels of the GPU's thread organization.
C++ Implementation
@@ -147,44 +171,56 @@ Y-space (Yield Space) represents the logical organization of work within each th
Work Assignment Structure
~~~~~~~~~~~~~~~~~~~~~~~~~
-.. raw:: html
-
-
- graph TB
- subgraph "Thread's Tile (2x2 elements)"
- Y00["Y=[0,0] Element 0"]
- Y01["Y=[0,1] Element 1"]
- Y10["Y=[1,0] Element 2"]
- Y11["Y=[1,1] Element 3"]
- end
-
- subgraph "Y-space Structure"
- YS["Each thread processes the same Y-space pattern but at different X locations"]
- end
-
- subgraph "Example: 4 Threads"
- T0["Thread 0 P=[0,0]"]
- T1["Thread 1 P=[0,1]"]
- T2["Thread 2 P=[1,0]"]
- T3["Thread 3 P=[1,1]"]
- end
-
- Y00 --> YS
- Y01 --> YS
- Y10 --> YS
- Y11 --> YS
-
- T0 --> YS
- T1 --> YS
- T2 --> YS
- T3 --> YS
-
- style Y00 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
- style Y01 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
- style Y10 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
- style Y11 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
-
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
+
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
+
+ .. mermaid::
+
+ graph TB
+ subgraph "Thread's Tile (2x2 elements)"
+ Y00["Y=[0,0] Element 0"]
+ Y01["Y=[0,1] Element 1"]
+ Y10["Y=[1,0] Element 2"]
+ Y11["Y=[1,1] Element 3"]
+ end
+
+ subgraph "Y-space Structure"
+ YS["Each thread processes the same Y-space pattern but at different X locations"]
+ end
+
+ subgraph "Example: 4 Threads"
+ T0["Thread 0 P=[0,0]"]
+ T1["Thread 1 P=[0,1]"]
+ T2["Thread 2 P=[1,0]"]
+ T3["Thread 3 P=[1,1]"]
+ end
+
+ Y00 --> YS
+ Y01 --> YS
+ Y10 --> YS
+ Y11 --> YS
+
+ T0 --> YS
+ T1 --> YS
+ T2 --> YS
+ T3 --> YS
+
+ style Y00 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+ style Y01 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+ style Y10 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+ style Y11 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+
+
+
+
+
+.. image:: diagrams/coordinate_systems_3.svg
+ :alt: Diagram
+ :align: center
The power of Y-space lies in its ability to express different iteration patterns without changing the underlying distribution logic. A thread might traverse its Y-space in row-major order for one algorithm, column-major for another, or even use :ref:`space-filling curves ` for optimal cache utilization. This flexibility enables algorithm-specific optimizations while maintaining a consistent framework.
Hierarchical Y-Space
@@ -265,37 +301,49 @@ The transformation from P and Y coordinates to X coordinates represents the hear
Transformation Pipeline
~~~~~~~~~~~~~~~~~~~~~~~
-.. raw:: html
-
-
- graph LR
- subgraph "Input"
- P["P-coordinates Thread identity P=[1,0]"]
- Y["Y-coordinates Element in tile Y=[0,1]"]
- end
-
- subgraph "Transformation"
- T["P + Y → X Base position + Offset"]
- end
-
- subgraph "Output"
- X["X-coordinates Tensor position X=[2,1]"]
- end
-
- subgraph "Example"
- E["Thread P=[1,0] at base (2,0) Element Y=[0,1] adds offset (0,1) Result X=[2,1] in tensor"]
- end
-
- P --> T
- Y --> T
- T --> X
- X --> E
-
- style P fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
- style Y fill:#fff3e0,stroke:#f57c00,stroke-width:2px
- style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
-
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
+
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
+
+ .. mermaid::
+
+ graph LR
+ subgraph "Input"
+ P["P-coordinates Thread identity P=[1,0]"]
+ Y["Y-coordinates Element in tile Y=[0,1]"]
+ end
+
+ subgraph "Transformation"
+ T["P + Y → X Base position + Offset"]
+ end
+
+ subgraph "Output"
+ X["X-coordinates Tensor position X=[2,1]"]
+ end
+
+ subgraph "Example"
+ E["Thread P=[1,0] at base (2,0) Element Y=[0,1] adds offset (0,1) Result X=[2,1] in tensor"]
+ end
+
+ P --> T
+ Y --> T
+ T --> X
+ X --> E
+
+ style P fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+ style Y fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+ style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+
+
+
+
+
+.. image:: diagrams/coordinate_systems_4.svg
+ :alt: Diagram
+ :align: center
Mathematical Foundation
~~~~~~~~~~~~~~~~~~~~~~~
@@ -361,33 +409,45 @@ D-space represents the final transformation in the coordinate pipeline—convert
Linearization Strategies
~~~~~~~~~~~~~~~~~~~~~~~~
-.. raw:: html
-
-
- graph LR
- subgraph "X-coordinates"
- X["X = [2, 3] 2D Position"]
- end
-
- subgraph "Layout Options"
- RM["Row-Major D = 2×width + 3"]
- CM["Column-Major D = 3×height + 2"]
- BL["Blocked Complex pattern"]
- end
-
- subgraph "D-coordinate"
- D["D = 11 Linear Address"]
- end
-
- X --> RM
- X --> CM
- X --> BL
- RM --> D
-
- style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
- style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
-
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
+
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
+
+ .. mermaid::
+
+ graph LR
+ subgraph "X-coordinates"
+ X["X = [2, 3] 2D Position"]
+ end
+
+ subgraph "Layout Options"
+ RM["Row-Major D = 2×width + 3"]
+ CM["Column-Major D = 3×height + 2"]
+ BL["Blocked Complex pattern"]
+ end
+
+ subgraph "D-coordinate"
+ D["D = 11 Linear Address"]
+ end
+
+ X --> RM
+ X --> CM
+ X --> BL
+ RM --> D
+
+ style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+ style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
+
+
+
+
+
+.. image:: diagrams/coordinate_systems_5.svg
+ :alt: Diagram
+ :align: center
The linearization process must consider multiple factors:
.. code-block:: cpp
@@ -417,48 +477,60 @@ Complete Pipeline Example
Let's trace through a complete example showing how all coordinate spaces work together:
-.. raw:: html
-
-
- graph TB
- subgraph "Step 1: Thread Identification"
- TID["Thread ID = 5"]
- P["P-coordinates P = [0, 5] (warp 0, lane 5)"]
- end
-
- subgraph "Step 2: Work Assignment"
- Y["Y-coordinates Y = [1, 0] (element in tile)"]
- end
-
- subgraph "Step 3: P+Y Transformation"
- TRANS["P + Y → X Thread position + Element offset"]
- X["X-coordinates X = [1, 5] (tensor position)"]
- end
-
- subgraph "Step 4: Linearization"
- LIN["X → D Row-major: D = x₀ × width + x₁"]
- D["D-coordinate D = 13 (memory address)"]
- end
-
- subgraph "Step 5: Memory Access"
- MEM["Hardware accesses memory[13]"]
- end
-
- TID --> P
- P --> TRANS
- Y --> TRANS
- TRANS --> X
- X --> LIN
- LIN --> D
- D --> MEM
-
- style P fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
- style Y fill:#fff3e0,stroke:#f57c00,stroke-width:3px
- style X fill:#e8f5e9,stroke:#388e3c,stroke-width:3px
- style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:3px
- style MEM fill:#ffebee,stroke:#c62828,stroke-width:3px
-
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
+
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
+
+ .. mermaid::
+
+ graph TB
+ subgraph "Step 1: Thread Identification"
+ TID["Thread ID = 5"]
+ P["P-coordinates P = [0, 5] (warp 0, lane 5)"]
+ end
+
+ subgraph "Step 2: Work Assignment"
+ Y["Y-coordinates Y = [1, 0] (element in tile)"]
+ end
+
+ subgraph "Step 3: P+Y Transformation"
+ TRANS["P + Y → X Thread position + Element offset"]
+ X["X-coordinates X = [1, 5] (tensor position)"]
+ end
+
+ subgraph "Step 4: Linearization"
+ LIN["X → D Row-major: D = x₀ × width + x₁"]
+ D["D-coordinate D = 13 (memory address)"]
+ end
+
+ subgraph "Step 5: Memory Access"
+ MEM["Hardware accesses memory[13]"]
+ end
+
+ TID --> P
+ P --> TRANS
+ Y --> TRANS
+ TRANS --> X
+ X --> LIN
+ LIN --> D
+ D --> MEM
+
+ style P fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
+ style Y fill:#fff3e0,stroke:#f57c00,stroke-width:3px
+ style X fill:#e8f5e9,stroke:#388e3c,stroke-width:3px
+ style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:3px
+ style MEM fill:#ffebee,stroke:#c62828,stroke-width:3px
+
+
+
+
+
+.. image:: diagrams/coordinate_systems_6.svg
+ :alt: Diagram
+ :align: center
Real-World Example: Matrix Multiplication
-----------------------------------------
diff --git a/docs/conceptual/ck_tile/descriptors.rst b/docs/conceptual/ck_tile/descriptors.rst
index 9e980ca3dc..d385e86759 100644
--- a/docs/conceptual/ck_tile/descriptors.rst
+++ b/docs/conceptual/ck_tile/descriptors.rst
@@ -99,35 +99,43 @@ Every TensorDescriptor in CK Tile can be thought of as a **transformation pipeli
..
Original mermaid diagram (edit here, then run update_diagrams.py)
- .. mermaid::
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
- graph LR
- subgraph "Pipeline Stages"
- S1["Stage 1 Base Layout [M, N]"]
- S2["Stage 2 Transform Unmerge"]
- S3["Stage 3 New View [M1, M2, N]"]
- S4["Stage N Final View [...]"]
- end
-
- subgraph "Same Data"
- D["Physical Memory No data movement"]
- end
-
- S1 --> S2
- S2 --> S3
- S3 --> S4
-
- S1 -.-> D
- S2 -.-> D
- S3 -.-> D
- S4 -.-> D
-
- style D fill:#ffebee,stroke:#d32f2f,stroke-width:2px
- style S1 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
- style S3 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+ .. mermaid::
+
+ graph LR
+ subgraph "Pipeline Stages"
+ S1["Stage 1 Base Layout [M, N]"]
+ S2["Stage 2 Transform Unmerge"]
+ S3["Stage 3 New View [M1, M2, N]"]
+ S4["Stage N Final View [...]"]
+ end
+
+ subgraph "Same Data"
+ D["Physical Memory No data movement"]
+ end
+
+ S1 --> S2
+ S2 --> S3
+ S3 --> S4
+
+ S1 -.-> D
+ S2 -.-> D
+ S3 -.-> D
+ S4 -.-> D
+
+ style D fill:#ffebee,stroke:#d32f2f,stroke-width:2px
+ style S1 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+ style S3 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+
+
+.. image:: diagrams/descriptors_1.svg
+ :alt: Diagram
+ :align: center
.. image:: diagrams/descriptors_1.svg
:alt: Diagram
:align: center
@@ -197,40 +205,48 @@ Analysis of the Final Pipeline
..
Original mermaid diagram (edit here, then run update_diagrams.py)
- .. mermaid::
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
- graph TB
- subgraph "Transform Pipeline"
- T0["Transform 0 Base Unmerge Input: [0] Output: [1,2]"]
- T1["Transform 1 PassThrough Input: [1] Output: [3]"]
- T2["Transform 2 Unmerge Input: [2] Output: [4,5]"]
- end
-
- subgraph "Hidden Dimensions"
- H0["Hidden ID 0 Raw Buffer"]
- H1["Hidden ID 1 Dim 0 (size 2)"]
- H2["Hidden ID 2 Dim 1 (size 6)"]
- H3["Hidden ID 3 Final Dim 0"]
- H4["Hidden ID 4 Final Dim 1"]
- H5["Hidden ID 5 Final Dim 2"]
- end
-
- H0 --> T0
- T0 --> H1
- T0 --> H2
- H1 --> T1
- H2 --> T2
- T1 --> H3
- T2 --> H4
- T2 --> H5
-
- style H0 fill:#ffebee,stroke:#d32f2f,stroke-width:2px
- style H3 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
- style H4 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
- style H5 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+ .. mermaid::
+
+ graph TB
+ subgraph "Transform Pipeline"
+ T0["Transform 0 Base Unmerge Input: [0] Output: [1,2]"]
+ T1["Transform 1 PassThrough Input: [1] Output: [3]"]
+ T2["Transform 2 Unmerge Input: [2] Output: [4,5]"]
+ end
+
+ subgraph "Hidden Dimensions"
+ H0["Hidden ID 0 Raw Buffer"]
+ H1["Hidden ID 1 Dim 0 (size 2)"]
+ H2["Hidden ID 2 Dim 1 (size 6)"]
+ H3["Hidden ID 3 Final Dim 0"]
+ H4["Hidden ID 4 Final Dim 1"]
+ H5["Hidden ID 5 Final Dim 2"]
+ end
+
+ H0 --> T0
+ T0 --> H1
+ T0 --> H2
+ H1 --> T1
+ H2 --> T2
+ T1 --> H3
+ T2 --> H4
+ T2 --> H5
+
+ style H0 fill:#ffebee,stroke:#d32f2f,stroke-width:2px
+ style H3 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+ style H4 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+ style H5 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+
+
+.. image:: diagrams/descriptors_2.svg
+ :alt: Diagram
+ :align: center
.. image:: diagrams/descriptors_2.svg
:alt: Diagram
:align: center
diff --git a/docs/conceptual/ck_tile/diagrams/buffer_views_1.svg b/docs/conceptual/ck_tile/diagrams/buffer_views_1.svg
new file mode 100644
index 0000000000..fb696c9e42
--- /dev/null
+++ b/docs/conceptual/ck_tile/diagrams/buffer_views_1.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/conceptual/ck_tile/diagrams/buffer_views_2.svg b/docs/conceptual/ck_tile/diagrams/buffer_views_2.svg
new file mode 100644
index 0000000000..7a58311b33
--- /dev/null
+++ b/docs/conceptual/ck_tile/diagrams/buffer_views_2.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/conceptual/ck_tile/diagrams/buffer_views_3.svg b/docs/conceptual/ck_tile/diagrams/buffer_views_3.svg
new file mode 100644
index 0000000000..8e20da9fa0
--- /dev/null
+++ b/docs/conceptual/ck_tile/diagrams/buffer_views_3.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/conceptual/ck_tile/diagrams/buffer_views_4.svg b/docs/conceptual/ck_tile/diagrams/buffer_views_4.svg
new file mode 100644
index 0000000000..f0b04d283b
--- /dev/null
+++ b/docs/conceptual/ck_tile/diagrams/buffer_views_4.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/conceptual/ck_tile/diagrams/convolution_example.svg b/docs/conceptual/ck_tile/diagrams/convolution_example.svg
index 1c36cf4f58..4a86641997 100644
--- a/docs/conceptual/ck_tile/diagrams/convolution_example.svg
+++ b/docs/conceptual/ck_tile/diagrams/convolution_example.svg
@@ -1 +1 @@
-
\ No newline at end of file
+
\ No newline at end of file
diff --git a/docs/conceptual/ck_tile/diagrams/coordinate_systems_1.svg b/docs/conceptual/ck_tile/diagrams/coordinate_systems_1.svg
new file mode 100644
index 0000000000..8890aa2362
--- /dev/null
+++ b/docs/conceptual/ck_tile/diagrams/coordinate_systems_1.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/conceptual/ck_tile/diagrams/coordinate_systems_2.svg b/docs/conceptual/ck_tile/diagrams/coordinate_systems_2.svg
new file mode 100644
index 0000000000..765318910a
--- /dev/null
+++ b/docs/conceptual/ck_tile/diagrams/coordinate_systems_2.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/conceptual/ck_tile/diagrams/coordinate_systems_3.svg b/docs/conceptual/ck_tile/diagrams/coordinate_systems_3.svg
new file mode 100644
index 0000000000..47846dfe4b
--- /dev/null
+++ b/docs/conceptual/ck_tile/diagrams/coordinate_systems_3.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/conceptual/ck_tile/diagrams/coordinate_systems_4.svg b/docs/conceptual/ck_tile/diagrams/coordinate_systems_4.svg
new file mode 100644
index 0000000000..3a9f04c73d
--- /dev/null
+++ b/docs/conceptual/ck_tile/diagrams/coordinate_systems_4.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/conceptual/ck_tile/diagrams/coordinate_systems_5.svg b/docs/conceptual/ck_tile/diagrams/coordinate_systems_5.svg
new file mode 100644
index 0000000000..f91d8b39ef
--- /dev/null
+++ b/docs/conceptual/ck_tile/diagrams/coordinate_systems_5.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/conceptual/ck_tile/diagrams/coordinate_systems_6.svg b/docs/conceptual/ck_tile/diagrams/coordinate_systems_6.svg
new file mode 100644
index 0000000000..0e0275457a
--- /dev/null
+++ b/docs/conceptual/ck_tile/diagrams/coordinate_systems_6.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/conceptual/ck_tile/diagrams/introduction_motivation_1.svg b/docs/conceptual/ck_tile/diagrams/introduction_motivation_1.svg
new file mode 100644
index 0000000000..55253de744
--- /dev/null
+++ b/docs/conceptual/ck_tile/diagrams/introduction_motivation_1.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/conceptual/ck_tile/diagrams/introduction_motivation_2.svg b/docs/conceptual/ck_tile/diagrams/introduction_motivation_2.svg
new file mode 100644
index 0000000000..524b6b2d40
--- /dev/null
+++ b/docs/conceptual/ck_tile/diagrams/introduction_motivation_2.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/conceptual/ck_tile/diagrams/tensor_views_1.svg b/docs/conceptual/ck_tile/diagrams/tensor_views_1.svg
new file mode 100644
index 0000000000..41338c8902
--- /dev/null
+++ b/docs/conceptual/ck_tile/diagrams/tensor_views_1.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/conceptual/ck_tile/diagrams/tensor_views_2.svg b/docs/conceptual/ck_tile/diagrams/tensor_views_2.svg
new file mode 100644
index 0000000000..f57636d293
--- /dev/null
+++ b/docs/conceptual/ck_tile/diagrams/tensor_views_2.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/conceptual/ck_tile/diagrams/tensor_views_3.svg b/docs/conceptual/ck_tile/diagrams/tensor_views_3.svg
new file mode 100644
index 0000000000..df13db0c0d
--- /dev/null
+++ b/docs/conceptual/ck_tile/diagrams/tensor_views_3.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/conceptual/ck_tile/diagrams/tensor_views_4.svg b/docs/conceptual/ck_tile/diagrams/tensor_views_4.svg
new file mode 100644
index 0000000000..8e521229cf
--- /dev/null
+++ b/docs/conceptual/ck_tile/diagrams/tensor_views_4.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/conceptual/ck_tile/diagrams/tensor_views_5.svg b/docs/conceptual/ck_tile/diagrams/tensor_views_5.svg
new file mode 100644
index 0000000000..2faec8d8d3
--- /dev/null
+++ b/docs/conceptual/ck_tile/diagrams/tensor_views_5.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/conceptual/ck_tile/diagrams/tile_distribution_1.svg b/docs/conceptual/ck_tile/diagrams/tile_distribution_1.svg
new file mode 100644
index 0000000000..19e7140013
--- /dev/null
+++ b/docs/conceptual/ck_tile/diagrams/tile_distribution_1.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/conceptual/ck_tile/diagrams/tile_distribution_2.svg b/docs/conceptual/ck_tile/diagrams/tile_distribution_2.svg
new file mode 100644
index 0000000000..6f588a46c4
--- /dev/null
+++ b/docs/conceptual/ck_tile/diagrams/tile_distribution_2.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/conceptual/ck_tile/diagrams/tile_distribution_3.svg b/docs/conceptual/ck_tile/diagrams/tile_distribution_3.svg
new file mode 100644
index 0000000000..0974e138fd
--- /dev/null
+++ b/docs/conceptual/ck_tile/diagrams/tile_distribution_3.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/conceptual/ck_tile/diagrams/tile_distribution_4.svg b/docs/conceptual/ck_tile/diagrams/tile_distribution_4.svg
new file mode 100644
index 0000000000..894151380d
--- /dev/null
+++ b/docs/conceptual/ck_tile/diagrams/tile_distribution_4.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/conceptual/ck_tile/diagrams/tile_distribution_5.svg b/docs/conceptual/ck_tile/diagrams/tile_distribution_5.svg
new file mode 100644
index 0000000000..2e46ee58cf
--- /dev/null
+++ b/docs/conceptual/ck_tile/diagrams/tile_distribution_5.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/conceptual/ck_tile/diagrams/tile_distribution_6.svg b/docs/conceptual/ck_tile/diagrams/tile_distribution_6.svg
new file mode 100644
index 0000000000..2195465e60
--- /dev/null
+++ b/docs/conceptual/ck_tile/diagrams/tile_distribution_6.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/conceptual/ck_tile/diagrams/tile_distribution_7.svg b/docs/conceptual/ck_tile/diagrams/tile_distribution_7.svg
new file mode 100644
index 0000000000..e9ec5a5780
--- /dev/null
+++ b/docs/conceptual/ck_tile/diagrams/tile_distribution_7.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/conceptual/ck_tile/diagrams/tile_window_2.svg b/docs/conceptual/ck_tile/diagrams/tile_window_2.svg
index a673c72866..60ec2dd1ce 100644
--- a/docs/conceptual/ck_tile/diagrams/tile_window_2.svg
+++ b/docs/conceptual/ck_tile/diagrams/tile_window_2.svg
@@ -1 +1 @@
-
\ No newline at end of file
+
\ No newline at end of file
diff --git a/docs/conceptual/ck_tile/introduction_motivation.rst b/docs/conceptual/ck_tile/introduction_motivation.rst
index 66df4187fb..33aaf19627 100644
--- a/docs/conceptual/ck_tile/introduction_motivation.rst
+++ b/docs/conceptual/ck_tile/introduction_motivation.rst
@@ -17,75 +17,81 @@ In this introduction, we establish the fundamental problems that tile distributi
The GPU Memory Problem
----------------------
-.. raw:: html
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
+
+ .. mermaid::
+
+ graph TB
+ subgraph "Random Access Pattern (Inefficient)"
+ subgraph "Threads"
+ T0_R["Thread 0"]
+ T1_R["Thread 1"]
+ T2_R["Thread 2"]
+ T3_R["Thread 3"]
+ end
-
+ subgraph "Memory"
+ M0["Mem[0]"]
+ M7["Mem[7]"]
+ M15["Mem[15]"]
+ M23["Mem[23]"]
+ M31["Mem[31]"]
+ M39["Mem[39]"]
+ M47["Mem[47]"]
+ M55["Mem[55]"]
+ end
+ T0_R -.-> M23
+ T1_R -.-> M7
+ T2_R -.-> M47
+ T3_R -.-> M15
+ end
+
+ subgraph "Tile Distribution Pattern (Efficient)"
+ subgraph "Threads_TD"
+ T0_TD["Thread 0"]
+ T1_TD["Thread 1"]
+ T2_TD["Thread 2"]
+ T3_TD["Thread 3"]
+ end
+
+ subgraph "Memory_TD"
+ M0_TD["Mem[0]"]
+ M1_TD["Mem[1]"]
+ M2_TD["Mem[2]"]
+ M3_TD["Mem[3]"]
+ M4_TD["Mem[4]"]
+ M5_TD["Mem[5]"]
+ M6_TD["Mem[6]"]
+ M7_TD["Mem[7]"]
+ end
+
+ T0_TD --> M0_TD
+ T0_TD --> M1_TD
+ T1_TD --> M2_TD
+ T1_TD --> M3_TD
+ T2_TD --> M4_TD
+ T2_TD --> M5_TD
+ T3_TD --> M6_TD
+ T3_TD --> M7_TD
+ end
+
+ style T0_R fill:#fee2e2,stroke:#ef4444,stroke-width:2px
+ style T1_R fill:#fee2e2,stroke:#ef4444,stroke-width:2px
+ style T2_R fill:#fee2e2,stroke:#ef4444,stroke-width:2px
+ style T3_R fill:#fee2e2,stroke:#ef4444,stroke-width:2px
+
+ style T0_TD fill:#d1fae5,stroke:#10b981,stroke-width:2px
+ style T1_TD fill:#d1fae5,stroke:#10b981,stroke-width:2px
+ style T2_TD fill:#d1fae5,stroke:#10b981,stroke-width:2px
+ style T3_TD fill:#d1fae5,stroke:#10b981,stroke-width:2px
+
+
+
+.. image:: diagrams/introduction_motivation_1.svg
+ :alt: Diagram
+ :align: center
Why Random Memory Access is Slow
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -209,36 +215,42 @@ The Coordinate Mapping Insight
At the heart of tile distribution lies a profound mathematical insight: efficient GPU computation requires a systematic framework for mapping between different coordinate spaces. This framework transforms the complex problem of thread-to-data assignment into a series of well-defined mathematical transformations, each serving a specific purpose in the journey from abstract algorithm to concrete hardware execution.
-.. raw:: html
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
+
+ .. mermaid::
+
+ graph LR
+ subgraph "Coordinate Spaces"
+ P["P-space Thread Position (thread_x, thread_y, warp_id, block_id)"]
+ Y["Y-space Local Data (y0, y1, y2, y3)"]
+ X["X-space Global Position (x0, x1)"]
+ D["D-space Memory Address (linearized)"]
+ end
-
- graph LR
- subgraph "Coordinate Spaces"
- P["P-space Thread Position (thread_x, thread_y, warp_id, block_id)"]
- Y["Y-space Local Data (y0, y1, y2, y3)"]
- X["X-space Global Position (x0, x1)"]
- D["D-space Memory Address (linearized)"]
- end
-
- subgraph "Transformations"
- T1["P + Y → X Thread data mapping"]
- T2["X → D Memory linearization"]
- end
-
- P --> T1
- Y --> T1
- T1 --> X
- X --> T2
- T2 --> D
-
- style P fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
- style Y fill:#fff3e0,stroke:#f57c00,stroke-width:2px
- style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
- style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
- style T1 fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
- style T2 fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
-
+ subgraph "Transformations"
+ T1["P + Y → X Thread data mapping"]
+ T2["X → D Memory linearization"]
+ end
+ P --> T1
+ Y --> T1
+ T1 --> X
+ X --> T2
+ T2 --> D
+
+ style P fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+ style Y fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+ style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+ style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
+ style T1 fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
+ style T2 fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
+
+
+
+.. image:: diagrams/introduction_motivation_2.svg
+ :alt: Diagram
+ :align: center
The elegance of this approach emerges from its separation of concerns. Each coordinate space represents a distinct aspect of the computation, and the transformations between them encapsulate specific optimization strategies. This separation allows developers to reason about their algorithms in natural terms while the framework handles the complex mapping to efficient hardware execution patterns.
**P-space (Thread Position Space)** represents the physical organization of threads on the GPU. This space captures the hierarchical nature of GPU execution, from individual threads identified by their x and y coordinates within a block, to warps that execute in lockstep, to thread blocks that share resources. The coordinates in P-space—thread_x, thread_y, warp_id, and block_id—directly correspond to the hardware's execution model. Understanding P-space is crucial because it determines which threads can cooperate efficiently through shared memory and which threads will execute their memory accesses simultaneously.
diff --git a/docs/conceptual/ck_tile/lds_index_swapping.rst b/docs/conceptual/ck_tile/lds_index_swapping.rst
index 35211d0368..3b246dad67 100644
--- a/docs/conceptual/ck_tile/lds_index_swapping.rst
+++ b/docs/conceptual/ck_tile/lds_index_swapping.rst
@@ -28,42 +28,50 @@ The original K coordinate is split into K0 and K1, where K1 represents the threa
..
Original mermaid diagram (edit here, then run update_diagrams.py)
- .. mermaid::
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
- graph TB
- subgraph "3D LDS coordinate [K0, M, K1]"
- K0["KPerBlock/KPack * MLdsLayer K0"]
- M["MPerBlock/MLdsLayer M"]
- K1["KPack K1"]
- end
+ .. mermaid::
- subgraph "XOR Transform"
- XT["make_xor_transform"]
- end
+ graph TB
+ subgraph "3D LDS coordinate [K0, M, K1]"
+ K0["KPerBlock/KPack * MLdsLayer K0"]
+ M["MPerBlock/MLdsLayer M"]
+ K1["KPack K1"]
+ end
+
+ subgraph "XOR Transform"
+ XT["make_xor_transform"]
+ end
+
+ subgraph "Update K0 with XOR transformation"
+ K01["KPerBlock/KPack * MLdsLayer K0'"]
+ M1["MPerBlock/MLdsLayer M"]
+ K11["KPack K1"]
+ end
+
+ K0 --> XT
+ M --> XT
+ K1 --> K11
+
+ XT --> K01
+ XT --> M1
+
+ style K0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+ style K01 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+ style M fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+ style M1 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+
+ style K1 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+ style K11 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
- subgraph "Update K0 with XOR transformation"
- K01["KPerBlock/KPack * MLdsLayer K0'"]
- M1["MPerBlock/MLdsLayer M"]
- K11["KPack K1"]
- end
- K0 --> XT
- M --> XT
- K1 --> K11
-
- XT --> K01
- XT --> M1
-
- style K0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
- style K01 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
- style M fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
- style M1 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
-
- style K1 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
- style K11 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+.. image:: diagrams/lds_index_swapping_1.svg
+ :alt: Diagram
+ :align: center
.. image:: diagrams/lds_index_swapping_1.svg
:alt: Diagram
:align: center
@@ -83,45 +91,53 @@ The transformed K0' is split into L and K0'' components, creating an intermediat
..
Original mermaid diagram (edit here, then run update_diagrams.py)
- .. mermaid::
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
- graph TB
- subgraph "3D LDS coordinate [K0', M, K1]"
- K0["KPerBlock/KPack * MLdsLayer K0'"]
- M["MPerBlock/MLdsLayer M"]
- K1["KPack K1"]
- end
+ .. mermaid::
- subgraph "Unmerge into 2 components"
- UM["make_unmerge_transform"]
- end
+ graph TB
+ subgraph "3D LDS coordinate [K0', M, K1]"
+ K0["KPerBlock/KPack * MLdsLayer K0'"]
+ M["MPerBlock/MLdsLayer M"]
+ K1["KPack K1"]
+ end
+
+ subgraph "Unmerge into 2 components"
+ UM["make_unmerge_transform"]
+ end
+
+ subgraph "4D intermediate transformation space"
+ L["MLdsLayer L"]
+ M1["MPerBlock/MLdsLayer M"]
+ K01["KPerBlock/KPack K0''"]
+ K11["KPack K1"]
+ end
+
+ K0 --> UM
+ M --> M1
+ K1 --> K11
+
+ UM --> L
+ UM --> K01
+
+ style K0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+ style L fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+ style K01 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+
+ style M fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+ style M1 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+
+ style K1 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+ style K11 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
- subgraph "4D intermediate transformation space"
- L["MLdsLayer L"]
- M1["MPerBlock/MLdsLayer M"]
- K01["KPerBlock/KPack K0''"]
- K11["KPack K1"]
- end
- K0 --> UM
- M --> M1
- K1 --> K11
-
- UM --> L
- UM --> K01
-
- style K0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
- style L fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
- style K01 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
-
- style M fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
- style M1 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
-
- style K1 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
- style K11 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+.. image:: diagrams/lds_index_swapping_2.svg
+ :alt: Diagram
+ :align: center
.. image:: diagrams/lds_index_swapping_2.svg
:alt: Diagram
:align: center
@@ -142,50 +158,58 @@ The final step merges the 4D coordinates back into 2D transformed coordinates (M
..
Original mermaid diagram (edit here, then run update_diagrams.py)
- .. mermaid::
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
- graph TB
- subgraph "4D LDS Coordinates [L, M, K0'', K1]"
- L["MLdsLayer L"]
- M1["MPerBlock/MLdsLayer M"]
- K0["KPerBlock/KPack K0''"]
- K1["KPack K1"]
- end
+ .. mermaid::
- subgraph "Merge into 1 component"
- ME0["make_merge_transform"]
- end
+ graph TB
+ subgraph "4D LDS Coordinates [L, M, K0'', K1]"
+ L["MLdsLayer L"]
+ M1["MPerBlock/MLdsLayer M"]
+ K0["KPerBlock/KPack K0''"]
+ K1["KPack K1"]
+ end
+
+ subgraph "Merge into 1 component"
+ ME0["make_merge_transform"]
+ end
+
+ subgraph "Merge into 1 component"
+ ME1["make_merge_transform"]
+ end
+
+ subgraph "Transformed 2D coordinates [M', K']"
+ M11["MPerBlock M'"]
+ K01["KPerBlock K'"]
+ end
+
+ L --> ME0
+ M1 --> ME0
+
+ K0 --> ME1
+ K1 --> ME1
+
+ ME0 --> M11
+ ME1 --> K01
+
+ style K0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+ style K1 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+ style K01 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+
+ style M1 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+ style L fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+ style M11 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
- subgraph "Merge into 1 component"
- ME1["make_merge_transform"]
- end
- subgraph "Transformed 2D coordinates [M', K']"
- M11["MPerBlock M'"]
- K01["KPerBlock K'"]
- end
- L --> ME0
- M1 --> ME0
- K0 --> ME1
- K1 --> ME1
-
- ME0 --> M11
- ME1 --> K01
-
- style K0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
- style K1 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
- style K01 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
-
- style M1 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
- style L fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
- style M11 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
-
-
+.. image:: diagrams/lds_index_swapping_3.svg
+ :alt: Diagram
+ :align: center
.. image:: diagrams/lds_index_swapping_3.svg
:alt: Diagram
:align: center
diff --git a/docs/conceptual/ck_tile/load_store_traits.rst b/docs/conceptual/ck_tile/load_store_traits.rst
index 7816157d0c..f269a406f0 100644
--- a/docs/conceptual/ck_tile/load_store_traits.rst
+++ b/docs/conceptual/ck_tile/load_store_traits.rst
@@ -106,27 +106,35 @@ LoadStoreTraits employs a advanced algorithm to select the best dimension for ve
..
Original mermaid diagram (edit here, then run update_diagrams.py)
- .. mermaid::
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
- graph TD
- A[Analyze Distribution] --> B{Check Each Dimension}
- B --> C[Calculate Stride]
- C --> D{Stride == 1?}
- D -->|Yes| E[Candidate for Vectorization]
- D -->|No| F[Skip Dimension]
- E --> G[Check Alignment]
- G --> H[Check Vector Size]
- H --> I[Score Dimension]
- F --> B
- I --> J[Select Best Dimension]
- J --> K[Configure Vector Access]
-
- style A fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
- style J fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
- style K fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+ .. mermaid::
+
+ graph TD
+ A[Analyze Distribution] --> B{Check Each Dimension}
+ B --> C[Calculate Stride]
+ C --> D{Stride == 1?}
+ D -->|Yes| E[Candidate for Vectorization]
+ D -->|No| F[Skip Dimension]
+ E --> G[Check Alignment]
+ G --> H[Check Vector Size]
+ H --> I[Score Dimension]
+ F --> B
+ I --> J[Select Best Dimension]
+ J --> K[Configure Vector Access]
+
+ style A fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+ style J fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+ style K fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+
+
+.. image:: diagrams/load_store_traits_1.svg
+ :alt: Diagram
+ :align: center
.. image:: diagrams/load_store_traits_1.svg
:alt: Diagram
:align: center
@@ -174,36 +182,44 @@ LoadStoreTraits creates efficient access patterns using space-filling curves:
..
Original mermaid diagram (edit here, then run update_diagrams.py)
- .. mermaid::
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
- graph LR
- subgraph "Linear Traversal"
- L1["0→1→2→3"]
- L2["4→5→6→7"]
- L3["Cache miss"]
- L4["8→9→10→11"]
- end
-
- subgraph "Snake Pattern"
- S1["0→1→2→3"]
- S2["7←6←5←4"]
- S3["Cache hit!"]
- S4["8→9→10→11"]
- end
-
- L1 --> L2
- L2 --> L3
- L3 --> L4
-
- S1 --> S2
- S2 --> S3
- S3 --> S4
-
- style L3 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
- style S3 fill:#d1fae5,stroke:#10b981,stroke-width:2px
+ .. mermaid::
+
+ graph LR
+ subgraph "Linear Traversal"
+ L1["0→1→2→3"]
+ L2["4→5→6→7"]
+ L3["Cache miss"]
+ L4["8→9→10→11"]
+ end
+
+ subgraph "Snake Pattern"
+ S1["0→1→2→3"]
+ S2["7←6←5←4"]
+ S3["Cache hit!"]
+ S4["8→9→10→11"]
+ end
+
+ L1 --> L2
+ L2 --> L3
+ L3 --> L4
+
+ S1 --> S2
+ S2 --> S3
+ S3 --> S4
+
+ style L3 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
+ style S3 fill:#d1fae5,stroke:#10b981,stroke-width:2px
+
+
+.. image:: diagrams/load_store_traits_2.svg
+ :alt: Diagram
+ :align: center
.. image:: diagrams/load_store_traits_2.svg
:alt: Diagram
:align: center
diff --git a/docs/conceptual/ck_tile/space_filling_curve.rst b/docs/conceptual/ck_tile/space_filling_curve.rst
index 5d98f2fd3d..67b9b53824 100644
--- a/docs/conceptual/ck_tile/space_filling_curve.rst
+++ b/docs/conceptual/ck_tile/space_filling_curve.rst
@@ -188,36 +188,44 @@ The snake pattern reverses traversal direction on alternate rows, minimizing the
..
Original mermaid diagram (edit here, then run update_diagrams.py)
- .. mermaid::
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
- graph LR
- subgraph "Linear Pattern"
- L1["Row 0: →"]
- L2["Row 1: →"]
- L3["Jump back"]
- L4["Row 2: →"]
- end
-
- subgraph "Snake Pattern"
- S1["Row 0: →"]
- S2["Row 1: ←"]
- S3["Continue"]
- S4["Row 2: →"]
- end
-
- L1 --> L3
- L3 --> L2
- L2 --> L3
- L3 --> L4
-
- S1 --> S2
- S2 --> S4
-
- style L3 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
- style S3 fill:#d1fae5,stroke:#10b981,stroke-width:2px
+ .. mermaid::
+
+ graph LR
+ subgraph "Linear Pattern"
+ L1["Row 0: →"]
+ L2["Row 1: →"]
+ L3["Jump back"]
+ L4["Row 2: →"]
+ end
+
+ subgraph "Snake Pattern"
+ S1["Row 0: →"]
+ S2["Row 1: ←"]
+ S3["Continue"]
+ S4["Row 2: →"]
+ end
+
+ L1 --> L3
+ L3 --> L2
+ L2 --> L3
+ L3 --> L4
+
+ S1 --> S2
+ S2 --> S4
+
+ style L3 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
+ style S3 fill:#d1fae5,stroke:#10b981,stroke-width:2px
+
+
+.. image:: diagrams/space_filling_curve.svg
+ :alt: Diagram
+ :align: center
.. image:: diagrams/space_filling_curve.svg
:alt: Diagram
:align: center
diff --git a/docs/conceptual/ck_tile/static_distributed_tensor.rst b/docs/conceptual/ck_tile/static_distributed_tensor.rst
index fe7ca2a365..a454a8ca38 100644
--- a/docs/conceptual/ck_tile/static_distributed_tensor.rst
+++ b/docs/conceptual/ck_tile/static_distributed_tensor.rst
@@ -88,21 +88,29 @@ The memory layout follows a hierarchical pattern:
..
Original mermaid diagram (edit here, then run update_diagrams.py)
- .. mermaid::
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
- graph TD
- A[Global Tensor 64x64] --> B[Thread Block 16x16]
- B --> C[Thread 0,0 Elements 0:3,0:3]
- B --> D[Thread 0,1 Elements 0:3,4:7]
- B --> E[Thread 1,0 Elements 4:7,0:3]
- B --> F[...]
-
- C --> G[Local Array 16 elements]
- D --> H[Local Array 16 elements]
- E --> I[Local Array 16 elements]
+ .. mermaid::
+
+ graph TD
+ A[Global Tensor 64x64] --> B[Thread Block 16x16]
+ B --> C[Thread 0,0 Elements 0:3,0:3]
+ B --> D[Thread 0,1 Elements 0:3,4:7]
+ B --> E[Thread 1,0 Elements 4:7,0:3]
+ B --> F[...]
+
+ C --> G[Local Array 16 elements]
+ D --> H[Local Array 16 elements]
+ E --> I[Local Array 16 elements]
+
+
+.. image:: diagrams/static_distributed_tensor.svg
+ :alt: Diagram
+ :align: center
.. image:: diagrams/static_distributed_tensor.svg
:alt: Diagram
:align: center
diff --git a/docs/conceptual/ck_tile/sweep_tile.rst b/docs/conceptual/ck_tile/sweep_tile.rst
index 7ad7040764..25208cae05 100644
--- a/docs/conceptual/ck_tile/sweep_tile.rst
+++ b/docs/conceptual/ck_tile/sweep_tile.rst
@@ -18,35 +18,43 @@ The key insight is the "load once, use many times" pattern. Load X data once int
..
Original mermaid diagram (edit here, then run update_diagrams.py)
- .. mermaid::
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
- flowchart LR
- subgraph "X-Tile (Reused)"
- XT["X data loaded once Stays in registers"]
- end
-
- subgraph "Y-Sweep"
- Y1["Y position 0"]
- Y2["Y position 1"]
- Y3["Y position 2"]
- YN["Y position N"]
- end
-
- subgraph "Computation"
- C["Process(X, Y)"]
- end
-
- XT --> C
- Y1 --> C
- Y2 --> C
- Y3 --> C
- YN --> C
-
- style XT fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
- style C fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
+ .. mermaid::
+
+ flowchart LR
+ subgraph "X-Tile (Reused)"
+ XT["X data loaded once Stays in registers"]
+ end
+
+ subgraph "Y-Sweep"
+ Y1["Y position 0"]
+ Y2["Y position 1"]
+ Y3["Y position 2"]
+ YN["Y position N"]
+ end
+
+ subgraph "Computation"
+ C["Process(X, Y)"]
+ end
+
+ XT --> C
+ Y1 --> C
+ Y2 --> C
+ Y3 --> C
+ YN --> C
+
+ style XT fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+ style C fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
+
+
+.. image:: diagrams/sweep_tile_1.svg
+ :alt: Diagram
+ :align: center
.. image:: diagrams/sweep_tile_1.svg
:alt: Diagram
:align: center
@@ -122,30 +130,38 @@ The sweep pattern provides significant memory efficiency benefits:
..
Original mermaid diagram (edit here, then run update_diagrams.py)
- .. mermaid::
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
- graph TB
- subgraph "Traditional Approach"
- T1["Load X[0]"] --> P1["Process"]
- T2["Load Y[0]"] --> P1
- T3["Load X[0]"] --> P2["Process"]
- T4["Load Y[1]"] --> P2
- T5["Load X[0]"] --> P3["Process"]
- T6["Load Y[2]"] --> P3
- Note1["X loaded 3 times!"]
- end
-
- subgraph "Sweep Approach"
- S1["Load X[0]"] --> SP["Process with Y[0], Y[1], Y[2]"]
- S2["Load Y[0,1,2]"] --> SP
- Note2["X loaded once!"]
- end
-
- style Note1 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
- style Note2 fill:#d1fae5,stroke:#10b981,stroke-width:2px
+ .. mermaid::
+
+ graph TB
+ subgraph "Traditional Approach"
+ T1["Load X[0]"] --> P1["Process"]
+ T2["Load Y[0]"] --> P1
+ T3["Load X[0]"] --> P2["Process"]
+ T4["Load Y[1]"] --> P2
+ T5["Load X[0]"] --> P3["Process"]
+ T6["Load Y[2]"] --> P3
+ Note1["X loaded 3 times!"]
+ end
+
+ subgraph "Sweep Approach"
+ S1["Load X[0]"] --> SP["Process with Y[0], Y[1], Y[2]"]
+ S2["Load Y[0,1,2]"] --> SP
+ Note2["X loaded once!"]
+ end
+
+ style Note1 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
+ style Note2 fill:#d1fae5,stroke:#10b981,stroke-width:2px
+
+
+.. image:: diagrams/sweep_tile_2.svg
+ :alt: Diagram
+ :align: center
.. image:: diagrams/sweep_tile_2.svg
:alt: Diagram
:align: center
@@ -381,37 +397,45 @@ Sweep operations provide several performance benefits:
..
Original mermaid diagram (edit here, then run update_diagrams.py)
- .. mermaid::
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
- graph TB
- subgraph "Sweep Performance Benefits"
- B1["Zero runtime overhead Compile-time unrolling"]
- B2["Perfect memory coalescing Sequential access patterns"]
- B3["Automatic vectorization Compiler optimizations"]
- B4["Register reuse X data stays in VGPR"]
- end
-
- subgraph "Use Cases"
- U1["Matrix Multiplication Reuse A columns"]
- U2["Convolution Reuse filter weights"]
- U3["Reduction Accumulate over Y"]
- U4["Broadcast Apply X to all Y"]
- end
-
- B1 --> Performance["High Performance"]
- B2 --> Performance
- B3 --> Performance
- B4 --> Performance
-
- Performance --> U1
- Performance --> U2
- Performance --> U3
- Performance --> U4
-
- style Performance fill:#d1fae5,stroke:#10b981,stroke-width:3px
+ .. mermaid::
+
+ graph TB
+ subgraph "Sweep Performance Benefits"
+ B1["Zero runtime overhead Compile-time unrolling"]
+ B2["Perfect memory coalescing Sequential access patterns"]
+ B3["Automatic vectorization Compiler optimizations"]
+ B4["Register reuse X data stays in VGPR"]
+ end
+
+ subgraph "Use Cases"
+ U1["Matrix Multiplication Reuse A columns"]
+ U2["Convolution Reuse filter weights"]
+ U3["Reduction Accumulate over Y"]
+ U4["Broadcast Apply X to all Y"]
+ end
+
+ B1 --> Performance["High Performance"]
+ B2 --> Performance
+ B3 --> Performance
+ B4 --> Performance
+
+ Performance --> U1
+ Performance --> U2
+ Performance --> U3
+ Performance --> U4
+
+ style Performance fill:#d1fae5,stroke:#10b981,stroke-width:3px
+
+
+.. image:: diagrams/sweep_tile_3.svg
+ :alt: Diagram
+ :align: center
.. image:: diagrams/sweep_tile_3.svg
:alt: Diagram
:align: center
@@ -453,28 +477,36 @@ Complete workflow example:
..
Original mermaid diagram (edit here, then run update_diagrams.py)
- .. mermaid::
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
- flowchart TB
- subgraph "Complete Workflow"
- TD["TileDistribution Define data layout"]
- TW["TileWindow Create view"]
- DT["DistributedTensor Load X data"]
- ST["SweepTile Iterate Y positions"]
- R["Results Store outputs"]
- end
-
- TD --> TW
- TW --> DT
- DT --> ST
- ST --> R
-
- style TD fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
- style ST fill:#fff3e0,stroke:#f57c00,stroke-width:3px
- style R fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+ .. mermaid::
+
+ flowchart TB
+ subgraph "Complete Workflow"
+ TD["TileDistribution Define data layout"]
+ TW["TileWindow Create view"]
+ DT["DistributedTensor Load X data"]
+ ST["SweepTile Iterate Y positions"]
+ R["Results Store outputs"]
+ end
+
+ TD --> TW
+ TW --> DT
+ DT --> ST
+ ST --> R
+
+ style TD fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+ style ST fill:#fff3e0,stroke:#f57c00,stroke-width:3px
+ style R fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+
+
+.. image:: diagrams/sweep_tile_4.svg
+ :alt: Diagram
+ :align: center
.. image:: diagrams/sweep_tile_4.svg
:alt: Diagram
:align: center
diff --git a/docs/conceptual/ck_tile/tensor_views.rst b/docs/conceptual/ck_tile/tensor_views.rst
index 7866eb9e58..0ba4bb4e82 100644
--- a/docs/conceptual/ck_tile/tensor_views.rst
+++ b/docs/conceptual/ck_tile/tensor_views.rst
@@ -13,40 +13,52 @@ The power of TensorView lies in its ability to present different logical views o
TensorView Architecture
-----------------------
-.. raw:: html
-
-
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
+
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
+
+ .. mermaid::
+
+ graph TB
+ subgraph "BufferView"
+ BV1["Linear indexing only"]
+ BV2["buffer[5]"]
+ BV3["No shape information"]
+ BV4["Direct memory access"]
+ end
+
+ subgraph "TensorView"
+ TV1["Multi-dimensional indexing"]
+ TV2["tensor(1, 2)"]
+ TV3["Shape-aware operations"]
+ TV4["Coordinate transformations"]
+ end
+
+ subgraph "Use Cases"
+ UC1["BufferView: Low-level memory ops"]
+ UC2["TensorView: Matrix/tensor algorithms"]
+ end
+
+ BV1 --> UC1
+ TV1 --> UC2
+
+ style BV1 fill:#dbeafe,stroke:#3b82f6,stroke-width:2px
+ style TV1 fill:#fce7f3,stroke:#ec4899,stroke-width:2px
+
+
+
+
+
+.. image:: diagrams/tensor_views_5.svg
+ :alt: Diagram
+ :align: center
BufferView excels at raw memory operations where linear access is natural or where the overhead of coordinate calculation would be prohibitive. TensorView shines when algorithms naturally think in terms of multi-dimensional coordinates, such as matrix operations, image processing, or tensor contractions.
Integration with Tile Distribution
diff --git a/docs/conceptual/ck_tile/thread_mapping.rst b/docs/conceptual/ck_tile/thread_mapping.rst
index 69135c54fc..29dd817b9f 100644
--- a/docs/conceptual/ck_tile/thread_mapping.rst
+++ b/docs/conceptual/ck_tile/thread_mapping.rst
@@ -82,53 +82,61 @@ Composable Kernel abstracts thread identification into partition indices, buildi
..
Original mermaid diagram (edit here, then run update_diagrams.py)
- .. mermaid::
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
- graph TB
- subgraph "GPU Device"
- subgraph "Thread Block"
- subgraph "Warp 0"
- T0["Thread 0 lane_id=0"]
- T1["Thread 1 lane_id=1"]
- T2["..."]
- T31["Thread 31 lane_id=31"]
- end
-
- subgraph "Warp 1"
- T32["Thread 32 lane_id=0"]
- T33["Thread 33 lane_id=1"]
- T34["..."]
- T63["Thread 63 lane_id=31"]
- end
-
- W2["Warp 2"]
- W3["..."]
- W7["Warp 7"]
- end
- end
-
- subgraph "Thread Identification"
- TID["Thread ID = blockIdx.x * blockDim.x + threadIdx.x"]
- WID["Warp ID = threadIdx.x / 32"]
- LID["Lane ID = threadIdx.x % 32"]
- end
-
- subgraph "P-space Mapping"
- P["P-coordinates NDimP=1: [thread_id] NDimP=2: [warp_id, lane_id]"]
- end
-
- T0 --> TID
- TID --> WID
- TID --> LID
- WID --> P
- LID --> P
-
- style T0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
- style T32 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
- style P fill:#fff3e0,stroke:#f57c00,stroke-width:3px
+ .. mermaid::
+
+ graph TB
+ subgraph "GPU Device"
+ subgraph "Thread Block"
+ subgraph "Warp 0"
+ T0["Thread 0 lane_id=0"]
+ T1["Thread 1 lane_id=1"]
+ T2["..."]
+ T31["Thread 31 lane_id=31"]
+ end
+
+ subgraph "Warp 1"
+ T32["Thread 32 lane_id=0"]
+ T33["Thread 33 lane_id=1"]
+ T34["..."]
+ T63["Thread 63 lane_id=31"]
+ end
+
+ W2["Warp 2"]
+ W3["..."]
+ W7["Warp 7"]
+ end
+ end
+
+ subgraph "Thread Identification"
+ TID["Thread ID = blockIdx.x * blockDim.x + threadIdx.x"]
+ WID["Warp ID = threadIdx.x / 32"]
+ LID["Lane ID = threadIdx.x % 32"]
+ end
+
+ subgraph "P-space Mapping"
+ P["P-coordinates NDimP=1: [thread_id] NDimP=2: [warp_id, lane_id]"]
+ end
+
+ T0 --> TID
+ TID --> WID
+ TID --> LID
+ WID --> P
+ LID --> P
+
+ style T0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+ style T32 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+ style P fill:#fff3e0,stroke:#f57c00,stroke-width:3px
+
+
+.. image:: diagrams/thread_mapping_1.svg
+ :alt: Diagram
+ :align: center
.. image:: diagrams/thread_mapping_1.svg
:alt: Diagram
:align: center
@@ -179,43 +187,51 @@ Once threads know their IDs, they need to map those IDs to specific data element
..
Original mermaid diagram (edit here, then run update_diagrams.py)
- .. mermaid::
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
- graph TB
- subgraph "Thread to Data Mapping"
- subgraph "Thread Grid"
- T00["Thread[0,0] Warp 0"]
- T01["Thread[0,1] Warp 0"]
- T10["Thread[1,0] Warp 1"]
- T11["Thread[1,1] Warp 1"]
- end
-
- subgraph "Data Tiles"
- D00["Data[0:4, 0:4] 16 elements"]
- D01["Data[0:4, 4:8] 16 elements"]
- D10["Data[4:8, 0:4] 16 elements"]
- D11["Data[4:8, 4:8] 16 elements"]
- end
-
- subgraph "Memory Access"
- MA["Coalesced Access Adjacent threads → Adjacent memory"]
- end
- end
-
- T00 --> D00
- T01 --> D01
- T10 --> D10
- T11 --> D11
-
- D00 --> MA
- D01 --> MA
-
- style T00 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
- style D00 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
- style MA fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+ .. mermaid::
+
+ graph TB
+ subgraph "Thread to Data Mapping"
+ subgraph "Thread Grid"
+ T00["Thread[0,0] Warp 0"]
+ T01["Thread[0,1] Warp 0"]
+ T10["Thread[1,0] Warp 1"]
+ T11["Thread[1,1] Warp 1"]
+ end
+
+ subgraph "Data Tiles"
+ D00["Data[0:4, 0:4] 16 elements"]
+ D01["Data[0:4, 4:8] 16 elements"]
+ D10["Data[4:8, 0:4] 16 elements"]
+ D11["Data[4:8, 4:8] 16 elements"]
+ end
+
+ subgraph "Memory Access"
+ MA["Coalesced Access Adjacent threads → Adjacent memory"]
+ end
+ end
+
+ T00 --> D00
+ T01 --> D01
+ T10 --> D10
+ T11 --> D11
+
+ D00 --> MA
+ D01 --> MA
+
+ style T00 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+ style D00 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+ style MA fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+
+
+.. image:: diagrams/thread_mapping_2.svg
+ :alt: Diagram
+ :align: center
.. image:: diagrams/thread_mapping_2.svg
:alt: Diagram
:align: center
diff --git a/docs/conceptual/ck_tile/tile_distribution.rst b/docs/conceptual/ck_tile/tile_distribution.rst
index db7eb7a466..e36b3d60be 100644
--- a/docs/conceptual/ck_tile/tile_distribution.rst
+++ b/docs/conceptual/ck_tile/tile_distribution.rst
@@ -19,79 +19,103 @@ The elegance of this design lies in its ability to adapt to diverse computationa
Complete Tile Distribution System Overview
------------------------------------------
-.. raw:: html
-
-
- graph TB
- subgraph "Logical View"
- T["Tensor Multi-dimensional data"]
- TD["TileDistribution Work assignment"]
- TW["TileWindow Data view"]
- end
-
- subgraph "Coordinate Spaces"
- X["X: Physical tensor coords"]
- Y["Y: Tile pattern coords"]
- P["P: Processing element coords"]
- R["R: Replication coords (optional)"]
- end
-
- subgraph "GPU Execution"
- W["Warps 32 threads each"]
- L["Lanes Thread within warp"]
- REG["Registers Thread-local storage"]
- end
-
- T --> TD
- TD --> TW
-
- TD --> X
- TD --> Y
- TD --> P
- TD --> R
-
- P --> W
- P --> L
- TW --> REG
-
- style TD fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
- style P fill:#fff3e0,stroke:#f57c00,stroke-width:2px
- style REG fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
-
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
+
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
+
+ .. mermaid::
+
+ graph TB
+ subgraph "Logical View"
+ T["Tensor Multi-dimensional data"]
+ TD["TileDistribution Work assignment"]
+ TW["TileWindow Data view"]
+ end
+
+ subgraph "Coordinate Spaces"
+ X["X: Physical tensor coords"]
+ Y["Y: Tile pattern coords"]
+ P["P: Processing element coords"]
+ R["R: Replication coords (optional)"]
+ end
+
+ subgraph "GPU Execution"
+ W["Warps 32 threads each"]
+ L["Lanes Thread within warp"]
+ REG["Registers Thread-local storage"]
+ end
+
+ T --> TD
+ TD --> TW
+
+ TD --> X
+ TD --> Y
+ TD --> P
+ TD --> R
+
+ P --> W
+ P --> L
+ TW --> REG
+
+ style TD fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
+ style P fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+ style REG fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+
+
+
+
+
+.. image:: diagrams/tile_distribution_1.svg
+ :alt: Diagram
+ :align: center
Coordinate System Architecture
------------------------------
-.. raw:: html
-
-
- flowchart LR
- subgraph "Input"
- TC["Thread Coordinates (warpId, laneId)"]
- end
-
- subgraph "Transformation Pipeline"
- P2Y["P → Y Thread to pattern"]
- Y2X["Y → X Pattern to physical"]
- Y2D["Y → D Pattern to register"]
- end
-
- subgraph "Output"
- MC["Memory Coordinates Global addresses"]
- RI["Register Indices Local storage"]
- end
-
- TC --> P2Y
- P2Y --> Y2X
- P2Y --> Y2D
- Y2X --> MC
- Y2D --> RI
-
- style TC fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
- style MC fill:#d1fae5,stroke:#10b981,stroke-width:2px
- style RI fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
-
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
+
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
+
+ .. mermaid::
+
+ flowchart LR
+ subgraph "Input"
+ TC["Thread Coordinates (warpId, laneId)"]
+ end
+
+ subgraph "Transformation Pipeline"
+ P2Y["P → Y Thread to pattern"]
+ Y2X["Y → X Pattern to physical"]
+ Y2D["Y → D Pattern to register"]
+ end
+
+ subgraph "Output"
+ MC["Memory Coordinates Global addresses"]
+ RI["Register Indices Local storage"]
+ end
+
+ TC --> P2Y
+ P2Y --> Y2X
+ P2Y --> Y2D
+ Y2X --> MC
+ Y2D --> RI
+
+ style TC fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
+ style MC fill:#d1fae5,stroke:#10b981,stroke-width:2px
+ style RI fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
+
+
+
+
+
+.. image:: diagrams/tile_distribution_2.svg
+ :alt: Diagram
+ :align: center
What is Tile Distribution?
--------------------------
@@ -132,41 +156,53 @@ The key insight that makes TileDistribution effective is its ability to abstract
Problem Space Mapping
---------------------
-.. raw:: html
-
-
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
+
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
+
+ .. mermaid::
+
+
+ graph TB
+ subgraph "Problem Space (256×256 Matrix)"
+ M["Full Matrix 65,536 elements"]
+ T1["Tile 1 32×32"]
+ T2["Tile 2 32×32"]
+ TN["Tile N 32×32"]
+ end
+
+ subgraph "Thread Assignment"
+ W0["Warp 0 32 threads"]
+ W1["Warp 1 32 threads"]
+ L0["Lane 0-31 Individual threads"]
+ end
+
+ subgraph "Memory Pattern"
+ MP["Coalesced Access Sequential addresses No bank conflicts"]
+ end
+
+ M --> T1
+ M --> T2
+ M --> TN
+
+ T1 --> W0
+ T1 --> W1
+ W0 --> L0
+ L0 --> MP
+
+ style M fill:#fee2e2,stroke:#ef4444,stroke-width:2px
+ style MP fill:#d1fae5,stroke:#10b981,stroke-width:2px
+
+
+
+
- graph TB
- subgraph "Problem Space (256×256 Matrix)"
- M["Full Matrix 65,536 elements"]
- T1["Tile 1 32×32"]
- T2["Tile 2 32×32"]
- TN["Tile N 32×32"]
- end
-
- subgraph "Thread Assignment"
- W0["Warp 0 32 threads"]
- W1["Warp 1 32 threads"]
- L0["Lane 0-31 Individual threads"]
- end
-
- subgraph "Memory Pattern"
- MP["Coalesced Access Sequential addresses No bank conflicts"]
- end
-
- M --> T1
- M --> T2
- M --> TN
-
- T1 --> W0
- T1 --> W1
- W0 --> L0
- L0 --> MP
-
- style M fill:#fee2e2,stroke:#ef4444,stroke-width:2px
- style MP fill:#d1fae5,stroke:#10b981,stroke-width:2px
-
+.. image:: diagrams/tile_distribution_3.svg
+ :alt: Diagram
+ :align: center
Creating a TileDistribution
---------------------------
@@ -339,38 +375,50 @@ Let's see how to create and use a TileDistribution in practice:
Hierarchical Decomposition
--------------------------
-.. raw:: html
-
-
- graph TB
- subgraph "Level 1: Block Distribution"
- B["Thread Block 256 threads"]
- BT1["Block Tile 1 64×64"]
- BT2["Block Tile 2 64×64"]
- end
-
- subgraph "Level 2: Warp Distribution"
- W["Warp 32 threads"]
- WT1["Warp Tile 1 16×16"]
- WT2["Warp Tile 2 16×16"]
- end
-
- subgraph "Level 3: Thread Distribution"
- T["Thread"]
- TT["Thread Tile 2×2"]
- end
-
- B --> BT1
- BT1 --> W
- W --> WT1
- WT1 --> T
- T --> TT
-
- style B fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
- style W fill:#fff3e0,stroke:#f57c00,stroke-width:2px
- style T fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
-
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
+
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
+
+ .. mermaid::
+
+ graph TB
+ subgraph "Level 1: Block Distribution"
+ B["Thread Block 256 threads"]
+ BT1["Block Tile 1 64×64"]
+ BT2["Block Tile 2 64×64"]
+ end
+
+ subgraph "Level 2: Warp Distribution"
+ W["Warp 32 threads"]
+ WT1["Warp Tile 1 16×16"]
+ WT2["Warp Tile 2 16×16"]
+ end
+
+ subgraph "Level 3: Thread Distribution"
+ T["Thread"]
+ TT["Thread Tile 2×2"]
+ end
+
+ B --> BT1
+ BT1 --> W
+ W --> WT1
+ WT1 --> T
+ T --> TT
+
+ style B fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+ style W fill:#fff3e0,stroke:#f57c00,stroke-width:2px
+ style T fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+
+
+
+
+
+.. image:: diagrams/tile_distribution_4.svg
+ :alt: Diagram
+ :align: center
Advanced Example: Matrix Multiplication Distribution
----------------------------------------------------
@@ -422,36 +470,48 @@ Advanced Example: Matrix Multiplication Distribution
Work Distribution Pattern
-------------------------
-.. raw:: html
-
-
- flowchart TB
- subgraph "Matrix C (128×128)"
- C["16,384 elements"]
- end
-
- subgraph "Thread Grid (32×32)"
- TG["1,024 threads"]
- end
-
- subgraph "Per Thread"
- PT["4×4 tile 16 elements"]
- end
-
- subgraph "Memory Access"
- MA["Coalesced reads Efficient writes No conflicts"]
- end
-
- C --> TG
- TG --> PT
- PT --> MA
-
- style C fill:#fee2e2,stroke:#ef4444,stroke-width:2px
- style TG fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
- style PT fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
- style MA fill:#d1fae5,stroke:#10b981,stroke-width:2px
-
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
+
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
+
+ .. mermaid::
+
+ flowchart TB
+ subgraph "Matrix C (128×128)"
+ C["16,384 elements"]
+ end
+
+ subgraph "Thread Grid (32×32)"
+ TG["1,024 threads"]
+ end
+
+ subgraph "Per Thread"
+ PT["4×4 tile 16 elements"]
+ end
+
+ subgraph "Memory Access"
+ MA["Coalesced reads Efficient writes No conflicts"]
+ end
+
+ C --> TG
+ TG --> PT
+ PT --> MA
+
+ style C fill:#fee2e2,stroke:#ef4444,stroke-width:2px
+ style TG fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
+ style PT fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+ style MA fill:#d1fae5,stroke:#10b981,stroke-width:2px
+
+
+
+
+
+.. image:: diagrams/tile_distribution_5.svg
+ :alt: Diagram
+ :align: center
Memory Access Patterns
----------------------
@@ -464,79 +524,103 @@ One of the key benefits of TileDistribution is generating optimal memory access
Transformation Pipeline
-----------------------
-.. raw:: html
-
-
- graph LR
- subgraph "Input"
- TID["Thread ID (0-1023)"]
- end
-
- subgraph "Stage 1"
- P["P-coordinates (warp, lane)"]
- end
-
- subgraph "Stage 2"
- Y["Y-coordinates (tile position)"]
- end
-
- subgraph "Stage 3"
- X["X-coordinates (tensor indices)"]
- end
-
- subgraph "Output"
- ADDR["Memory addresses Register indices"]
- end
-
- TID --> P
- P --> Y
- Y --> X
- X --> ADDR
-
- style TID fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
- style ADDR fill:#d1fae5,stroke:#10b981,stroke-width:2px
-
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
+
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
+
+ .. mermaid::
+
+ graph LR
+ subgraph "Input"
+ TID["Thread ID (0-1023)"]
+ end
+
+ subgraph "Stage 1"
+ P["P-coordinates (warp, lane)"]
+ end
+
+ subgraph "Stage 2"
+ Y["Y-coordinates (tile position)"]
+ end
+
+ subgraph "Stage 3"
+ X["X-coordinates (tensor indices)"]
+ end
+
+ subgraph "Output"
+ ADDR["Memory addresses Register indices"]
+ end
+
+ TID --> P
+ P --> Y
+ Y --> X
+ X --> ADDR
+
+ style TID fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
+ style ADDR fill:#d1fae5,stroke:#10b981,stroke-width:2px
+
+
+
+
+
+.. image:: diagrams/tile_distribution_6.svg
+ :alt: Diagram
+ :align: center
Performance Comparison
----------------------
-.. raw:: html
-
-
- graph TB
- subgraph "Manual Implementation"
- M1["Calculate indices manually"]
- M2["Handle boundary conditions"]
- M3["Ensure coalescing"]
- M4["Manage bank conflicts"]
- M5["~200 lines of code"]
- end
-
- subgraph "With TileDistribution"
- T1["make_tile_distribution()"]
- T2["Automatic optimization"]
- T3["~10 lines of code"]
- end
-
- subgraph "Performance"
- P1["Same performance"]
- P2["Fewer bugs"]
- P3["Portable across GPUs"]
- end
-
- M1 --> M5
- T1 --> T3
-
- M5 --> P1
- T3 --> P1
- P1 --> P2
- P2 --> P3
-
- style M5 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
- style T3 fill:#d1fae5,stroke:#10b981,stroke-width:2px
- style P3 fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
-
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
+
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
+
+ .. mermaid::
+
+ graph TB
+ subgraph "Manual Implementation"
+ M1["Calculate indices manually"]
+ M2["Handle boundary conditions"]
+ M3["Ensure coalescing"]
+ M4["Manage bank conflicts"]
+ M5["~200 lines of code"]
+ end
+
+ subgraph "With TileDistribution"
+ T1["make_tile_distribution()"]
+ T2["Automatic optimization"]
+ T3["~10 lines of code"]
+ end
+
+ subgraph "Performance"
+ P1["Same performance"]
+ P2["Fewer bugs"]
+ P3["Portable across GPUs"]
+ end
+
+ M1 --> M5
+ T1 --> T3
+
+ M5 --> P1
+ T3 --> P1
+ P1 --> P2
+ P2 --> P3
+
+ style M5 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
+ style T3 fill:#d1fae5,stroke:#10b981,stroke-width:2px
+ style P3 fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
+
+
+
+
+
+.. image:: diagrams/tile_distribution_7.svg
+ :alt: Diagram
+ :align: center
Summary
-------
diff --git a/docs/conceptual/ck_tile/transforms.rst b/docs/conceptual/ck_tile/transforms.rst
index 7c9e9a21b8..1ad6fdec78 100644
--- a/docs/conceptual/ck_tile/transforms.rst
+++ b/docs/conceptual/ck_tile/transforms.rst
@@ -32,28 +32,36 @@ Zero-Copy Logical Operations
..
Original mermaid diagram (edit here, then run update_diagrams.py)
- .. mermaid::
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
- graph TB
- subgraph "Tensor Coordinate Transformation"
- US["Lower Dimension Space Source coordinate system"]
- LS["Upper Dimension Space Target coordinate system"]
-
- DATA["Linear Data in Memory Layout determined by tensor shape & strides"]
- end
-
- US -->|"Forward Transform"| LS
- LS -->|"Inverse Transform"| US
-
- DATA -.->|"Same data, different views"| US
- DATA -.->|"Same data, different views"| LS
-
- style US fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
- style LS fill:#fff3e0,stroke:#f57c00,stroke-width:3px
- style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+ .. mermaid::
+
+ graph TB
+ subgraph "Tensor Coordinate Transformation"
+ US["Lower Dimension Space Source coordinate system"]
+ LS["Upper Dimension Space Target coordinate system"]
+
+ DATA["Linear Data in Memory Layout determined by tensor shape & strides"]
+ end
+
+ US -->|"Forward Transform"| LS
+ LS -->|"Inverse Transform"| US
+
+ DATA -.->|"Same data, different views"| US
+ DATA -.->|"Same data, different views"| LS
+
+ style US fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
+ style LS fill:#fff3e0,stroke:#f57c00,stroke-width:3px
+ style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+
+
+.. image:: diagrams/transforms_1.svg
+ :alt: Diagram
+ :align: center
.. image:: diagrams/transforms_1.svg
:alt: Diagram
:align: center
@@ -74,38 +82,46 @@ Transform System Architecture
..
Original mermaid diagram (edit here, then run update_diagrams.py)
- .. mermaid::
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
- graph TB
-
- subgraph "Transform Types"
- EMB["EmbedTransform Linear → Multi-D Strided"]
- UNM["MergeTransform Multi-D → Linear"]
- MRG["UnmergeTransform Linear → Multi-D"]
- REP["ReplicateTransform 0D → Multi-D Broadcast"]
- OFF["OffsetTransform Translation"]
- PAS["PassThroughTransform Identity"]
- PAD["PadTransform Boundaries"]
- end
-
- subgraph "Operations"
- FWD["Forward calculate_lower_index()"]
- BWD["Backward calculate_upper_index()"]
- UPD["Update update_lower_index()"]
- end
-
- EMB --> FWD
- UNM --> FWD
- MRG --> FWD
- REP --> FWD
- OFF --> FWD
- PAS --> FWD
- PAD --> FWD
-
- style FWD fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+ .. mermaid::
+
+ graph TB
+
+ subgraph "Transform Types"
+ EMB["EmbedTransform Linear → Multi-D Strided"]
+ UNM["MergeTransform Multi-D → Linear"]
+ MRG["UnmergeTransform Linear → Multi-D"]
+ REP["ReplicateTransform 0D → Multi-D Broadcast"]
+ OFF["OffsetTransform Translation"]
+ PAS["PassThroughTransform Identity"]
+ PAD["PadTransform Boundaries"]
+ end
+
+ subgraph "Operations"
+ FWD["Forward calculate_lower_index()"]
+ BWD["Backward calculate_upper_index()"]
+ UPD["Update update_lower_index()"]
+ end
+
+ EMB --> FWD
+ UNM --> FWD
+ MRG --> FWD
+ REP --> FWD
+ OFF --> FWD
+ PAS --> FWD
+ PAD --> FWD
+
+ style FWD fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
+
+
+.. image:: diagrams/transforms_2.svg
+ :alt: Diagram
+ :align: center
.. image:: diagrams/transforms_2.svg
:alt: Diagram
:align: center
@@ -117,28 +133,36 @@ MergeTransform collapses multiple dimensions from the lower coordinate space int
..
Original mermaid diagram (edit here, then run update_diagrams.py)
- .. mermaid::
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
- graph TB
- subgraph "MergeTransform: Multi-D → Linear"
- LS["Lower Coordinate Space 2D: [4, 5] Coord: (2, 3)"]
- US["Upper Coordinate Space 1D Linear Index: 13"]
-
- DATA["Same Tensor Data Layout: row-major Size: 20 elements"]
- end
-
- LS -->|"Forward Transform 2×5 + 3 = 13"| US
- US -->|"Inverse Transform 13÷5=2, 13%5=3"| LS
-
- DATA -.->|"Multi-dimensional view"| LS
- DATA -.->|"Linear view"| US
-
- style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
- style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
- style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+ .. mermaid::
+
+ graph TB
+ subgraph "MergeTransform: Multi-D → Linear"
+ LS["Lower Coordinate Space 2D: [4, 5] Coord: (2, 3)"]
+ US["Upper Coordinate Space 1D Linear Index: 13"]
+
+ DATA["Same Tensor Data Layout: row-major Size: 20 elements"]
+ end
+
+ LS -->|"Forward Transform 2×5 + 3 = 13"| US
+ US -->|"Inverse Transform 13÷5=2, 13%5=3"| LS
+
+ DATA -.->|"Multi-dimensional view"| LS
+ DATA -.->|"Linear view"| US
+
+ style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
+ style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
+ style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+
+
+.. image:: diagrams/transforms_3.svg
+ :alt: Diagram
+ :align: center
.. image:: diagrams/transforms_3.svg
:alt: Diagram
:align: center
@@ -180,28 +204,36 @@ UnmergeTransform expands coordinates from a single dimension in the lower coordi
..
Original mermaid diagram (edit here, then run update_diagrams.py)
- .. mermaid::
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
- graph TB
- subgraph "UnmergeTransform: Linear → Multi-D"
- LS["Lower Coordinate Space 1D Linear Index: 14"]
- US["Upper Coordinate Space 3D: [3, 4, 2] Coord: (1, 3, 0)"]
-
- DATA["Same Tensor Data Layout: row-major Size: 24 elements"]
- end
-
- LS -->|"Forward Transform 14 = 1×8 + 3×2 + 0"| US
- US -->|"Inverse Transform linearize back"| LS
-
- DATA -.->|"Linear view"| LS
- DATA -.->|"Multi-dimensional view"| US
-
- style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
- style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
- style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+ .. mermaid::
+
+ graph TB
+ subgraph "UnmergeTransform: Linear → Multi-D"
+ LS["Lower Coordinate Space 1D Linear Index: 14"]
+ US["Upper Coordinate Space 3D: [3, 4, 2] Coord: (1, 3, 0)"]
+
+ DATA["Same Tensor Data Layout: row-major Size: 24 elements"]
+ end
+
+ LS -->|"Forward Transform 14 = 1×8 + 3×2 + 0"| US
+ US -->|"Inverse Transform linearize back"| LS
+
+ DATA -.->|"Linear view"| LS
+ DATA -.->|"Multi-dimensional view"| US
+
+ style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
+ style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
+ style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+
+
+.. image:: diagrams/transforms_4.svg
+ :alt: Diagram
+ :align: center
.. image:: diagrams/transforms_4.svg
:alt: Diagram
:align: center
@@ -253,28 +285,36 @@ EmbedTransform expands linear indices from the lower coordinate space into multi
..
Original mermaid diagram (edit here, then run update_diagrams.py)
- .. mermaid::
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
- graph TB
- subgraph "EmbedTransform: Linear → Multi-D Strided"
- LS["Lower Coordinate Space 1D Linear Index: 14"]
- US["Upper Coordinate Space 2D: [2, 3] Coord: (1, 2)"]
-
- DATA["Linear Buffer in Memory"]
- end
-
- LS -->|"Forward Transform Strides: [12, 1] 14 ÷ 12 = 1, 14 % 12 = 2"| US
- US -->|"Inverse Transform 1×12 + 2×1 = 14"| LS
-
- DATA -.->|"Linear index view"| LS
- DATA -.->|"Multi-dimensional strided view"| US
-
- style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
- style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
- style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+ .. mermaid::
+
+ graph TB
+ subgraph "EmbedTransform: Linear → Multi-D Strided"
+ LS["Lower Coordinate Space 1D Linear Index: 14"]
+ US["Upper Coordinate Space 2D: [2, 3] Coord: (1, 2)"]
+
+ DATA["Linear Buffer in Memory"]
+ end
+
+ LS -->|"Forward Transform Strides: [12, 1] 14 ÷ 12 = 1, 14 % 12 = 2"| US
+ US -->|"Inverse Transform 1×12 + 2×1 = 14"| LS
+
+ DATA -.->|"Linear index view"| LS
+ DATA -.->|"Multi-dimensional strided view"| US
+
+ style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
+ style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
+ style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+
+
+.. image:: diagrams/transforms_5.svg
+ :alt: Diagram
+ :align: center
.. image:: diagrams/transforms_5.svg
:alt: Diagram
:align: center
@@ -315,28 +355,36 @@ ReplicateTransform creates a higher-dimensional tensor by replicating (broadcast
..
Original mermaid diagram (edit here, then run update_diagrams.py)
- .. mermaid::
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
- graph TB
- subgraph "ReplicateTransform: 0D → Multi-D Broadcasting"
- LS["Lower Coordinate Space 0D: Scalar Empty coordinate []"]
- US["Upper Coordinate Space 2D: [3, 4] All coords: (i, j)"]
-
- DATA["Single Scalar Value"]
- end
-
- LS -->|"Forward Transform [] → (i,j) for any i,j"| US
- US -->|"Inverse Transform (i,j) → [] for any i,j"| LS
-
- DATA -.->|"One scalar value"| LS
- DATA -.->|"Broadcasted view at all positions"| US
-
- style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
- style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
- style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+ .. mermaid::
+
+ graph TB
+ subgraph "ReplicateTransform: 0D → Multi-D Broadcasting"
+ LS["Lower Coordinate Space 0D: Scalar Empty coordinate []"]
+ US["Upper Coordinate Space 2D: [3, 4] All coords: (i, j)"]
+
+ DATA["Single Scalar Value"]
+ end
+
+ LS -->|"Forward Transform [] → (i,j) for any i,j"| US
+ US -->|"Inverse Transform (i,j) → [] for any i,j"| LS
+
+ DATA -.->|"One scalar value"| LS
+ DATA -.->|"Broadcasted view at all positions"| US
+
+ style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
+ style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
+ style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+
+
+.. image:: diagrams/transforms_6.svg
+ :alt: Diagram
+ :align: center
.. image:: diagrams/transforms_6.svg
:alt: Diagram
:align: center
@@ -388,28 +436,36 @@ OffsetTransform shifts coordinates by a fixed offset, creating a translated view
..
Original mermaid diagram (edit here, then run update_diagrams.py)
- .. mermaid::
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
- graph TB
- subgraph "OffsetTransform: 1D → 1D Translation"
- LS["Lower Coordinate Space 1D: [0, 63] Coord: index + offset"]
- US["Upper Coordinate Space 1D: [0, 47] Coord: index"]
-
- DATA["Linear Buffer in Memory"]
- end
-
- LS -->|"Forward Transform idx → idx + 16"| US
- US -->|"Inverse Transform idx + 16 → idx"| LS
-
- DATA -.->|"Lower view"| LS
- DATA -.->|"Upper view"| US
-
- style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
- style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
- style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+ .. mermaid::
+
+ graph TB
+ subgraph "OffsetTransform: 1D → 1D Translation"
+ LS["Lower Coordinate Space 1D: [0, 63] Coord: index + offset"]
+ US["Upper Coordinate Space 1D: [0, 47] Coord: index"]
+
+ DATA["Linear Buffer in Memory"]
+ end
+
+ LS -->|"Forward Transform idx → idx + 16"| US
+ US -->|"Inverse Transform idx + 16 → idx"| LS
+
+ DATA -.->|"Lower view"| LS
+ DATA -.->|"Upper view"| US
+
+ style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
+ style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
+ style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+
+
+.. image:: diagrams/transforms_7.svg
+ :alt: Diagram
+ :align: center
.. image:: diagrams/transforms_7.svg
:alt: Diagram
:align: center
@@ -462,28 +518,36 @@ No-op transform that passes coordinates unchanged. The PassThrough transform is
..
Original mermaid diagram (edit here, then run update_diagrams.py)
- .. mermaid::
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
- graph TB
- subgraph "PassThroughTransform: 1D → 1D Identity"
- LS["Lower Coordinate Space 1D: [0, 59] Coord: index"]
- US["Upper Coordinate Space 1D: [0, 59] Coord: index"]
-
- DATA["Linear Buffer in Memory"]
- end
-
- LS -.->|"Perfect Identity idx → idx"| US
- US -.->|"Perfect Identity idx → idx"| LS
-
- DATA -->|"Same buffer same view"| LS
- DATA -->|"Same buffer same view"| US
-
- style LS fill:#e8f5e8,stroke:#2e7d32,stroke-width:3px
- style US fill:#e8f5e8,stroke:#2e7d32,stroke-width:3px
- style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+ .. mermaid::
+
+ graph TB
+ subgraph "PassThroughTransform: 1D → 1D Identity"
+ LS["Lower Coordinate Space 1D: [0, 59] Coord: index"]
+ US["Upper Coordinate Space 1D: [0, 59] Coord: index"]
+
+ DATA["Linear Buffer in Memory"]
+ end
+
+ LS -.->|"Perfect Identity idx → idx"| US
+ US -.->|"Perfect Identity idx → idx"| LS
+
+ DATA -->|"Same buffer same view"| LS
+ DATA -->|"Same buffer same view"| US
+
+ style LS fill:#e8f5e8,stroke:#2e7d32,stroke-width:3px
+ style US fill:#e8f5e8,stroke:#2e7d32,stroke-width:3px
+ style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+
+
+.. image:: diagrams/transforms_8.svg
+ :alt: Diagram
+ :align: center
.. image:: diagrams/transforms_8.svg
:alt: Diagram
:align: center
@@ -531,28 +595,36 @@ PadTransform adds padding to tensor dimensions, mapping coordinates from upper d
..
Original mermaid diagram (edit here, then run update_diagrams.py)
- .. mermaid::
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
- graph TB
- subgraph "PadTransform: 1D → 1D with Padding"
- LS["Lower Coordinate Space 1D: [0, 2] (original data)"]
- US["Upper Coordinate Space 1D: [0, 4] (with padding)"]
-
- DATA["Tensor Data in Memory"]
- end
-
- LS -->|"Forward Transform idx + left_pad"| US
- US -->|"Inverse Transform idx - left_pad"| LS
-
- DATA -.->|"Original view"| LS
- DATA -.->|"Padded view"| US
-
- style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
- style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
- style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+ .. mermaid::
+
+ graph TB
+ subgraph "PadTransform: 1D → 1D with Padding"
+ LS["Lower Coordinate Space 1D: [0, 2] (original data)"]
+ US["Upper Coordinate Space 1D: [0, 4] (with padding)"]
+
+ DATA["Tensor Data in Memory"]
+ end
+
+ LS -->|"Forward Transform idx + left_pad"| US
+ US -->|"Inverse Transform idx - left_pad"| LS
+
+ DATA -.->|"Original view"| LS
+ DATA -.->|"Padded view"| US
+
+ style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
+ style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
+ style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+
+
+.. image:: diagrams/transforms_9.svg
+ :alt: Diagram
+ :align: center
.. image:: diagrams/transforms_9.svg
:alt: Diagram
:align: center
@@ -603,28 +675,36 @@ XorTransform applies a 2D XOR mapping for specialized memory access patterns. It
..
Original mermaid diagram (edit here, then run update_diagrams.py)
- .. mermaid::
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
- graph TB
- subgraph "XorTransform: 2D → 2D XOR Mapping"
- LS["Lower Coordinate Space 2D: [4, 8] XOR-transformed coords"]
- US["Upper Coordinate Space 2D: [4, 8] Normal coords"]
-
- DATA["Same Tensor Data"]
- end
-
- LS -->|"Forward Transform apply XOR reverse"| US
- US -->|"Inverse Transform apply XOR mapping"| LS
-
- DATA -.->|"XOR pattern view"| LS
- DATA -.->|"Normal view"| US
-
- style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
- style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
- style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+ .. mermaid::
+
+ graph TB
+ subgraph "XorTransform: 2D → 2D XOR Mapping"
+ LS["Lower Coordinate Space 2D: [4, 8] XOR-transformed coords"]
+ US["Upper Coordinate Space 2D: [4, 8] Normal coords"]
+
+ DATA["Same Tensor Data"]
+ end
+
+ LS -->|"Forward Transform apply XOR reverse"| US
+ US -->|"Inverse Transform apply XOR mapping"| LS
+
+ DATA -.->|"XOR pattern view"| LS
+ DATA -.->|"Normal view"| US
+
+ style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
+ style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
+ style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+
+
+.. image:: diagrams/transforms_10.svg
+ :alt: Diagram
+ :align: center
.. image:: diagrams/transforms_10.svg
:alt: Diagram
:align: center
@@ -636,28 +716,36 @@ SliceTransform extracts a sub-region from a tensor dimension.
..
Original mermaid diagram (edit here, then run update_diagrams.py)
- .. mermaid::
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
- graph TB
- subgraph "SliceTransform: 1D → 1D Sub-region"
- LS["Lower Coordinate Space 1D: [0, 9] (original range)"]
- US["Upper Coordinate Space 1D: [0, 4] (slice range)"]
-
- DATA["Tensor Data in Memory"]
- end
-
- LS -->|"Forward Transform idx + slice_begin"| US
- US -->|"Inverse Transform idx - slice_begin"| LS
-
- DATA -.->|"Full tensor view"| LS
- DATA -.->|"Sub-region view"| US
-
- style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
- style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
- style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+ .. mermaid::
+
+ graph TB
+ subgraph "SliceTransform: 1D → 1D Sub-region"
+ LS["Lower Coordinate Space 1D: [0, 9] (original range)"]
+ US["Upper Coordinate Space 1D: [0, 4] (slice range)"]
+
+ DATA["Tensor Data in Memory"]
+ end
+
+ LS -->|"Forward Transform idx + slice_begin"| US
+ US -->|"Inverse Transform idx - slice_begin"| LS
+
+ DATA -.->|"Full tensor view"| LS
+ DATA -.->|"Sub-region view"| US
+
+ style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
+ style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
+ style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+
+
+.. image:: diagrams/transforms_11.svg
+ :alt: Diagram
+ :align: center
.. image:: diagrams/transforms_11.svg
:alt: Diagram
:align: center
@@ -669,28 +757,36 @@ ModuloTransform applies cyclic wrapping to coordinates using modulo operations.
..
Original mermaid diagram (edit here, then run update_diagrams.py)
- .. mermaid::
+..
+ Original mermaid diagram (edit here, then run update_diagrams.py)
- graph TB
- subgraph "ModuloTransform: 1D → 1D Cyclic"
- LS["Lower Coordinate Space 1D: [0, 3] (modulus range)"]
- US["Upper Coordinate Space 1D: [0, 15] (full range)"]
-
- DATA["Tensor Data in Memory"]
- end
-
- LS -->|"Forward Transform idx * cycle_count"| US
- US -->|"Inverse Transform idx % modulus"| LS
-
- DATA -.->|" "| LS
- DATA -.->|" "| US
-
- style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
- style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
- style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+ .. mermaid::
+
+ graph TB
+ subgraph "ModuloTransform: 1D → 1D Cyclic"
+ LS["Lower Coordinate Space 1D: [0, 3] (modulus range)"]
+ US["Upper Coordinate Space 1D: [0, 15] (full range)"]
+
+ DATA["Tensor Data in Memory"]
+ end
+
+ LS -->|"Forward Transform idx * cycle_count"| US
+ US -->|"Inverse Transform idx % modulus"| LS
+
+ DATA -.->|" "| LS
+ DATA -.->|" "| US
+
+ style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
+ style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
+ style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
+
+
+.. image:: diagrams/transforms_12.svg
+ :alt: Diagram
+ :align: center
.. image:: diagrams/transforms_12.svg
:alt: Diagram
:align: center