diff --git a/docs/conceptual/ck_tile/adaptors.rst b/docs/conceptual/ck_tile/adaptors.rst index 137b1eb5f0..035f1b494c 100644 --- a/docs/conceptual/ck_tile/adaptors.rst +++ b/docs/conceptual/ck_tile/adaptors.rst @@ -18,35 +18,43 @@ A TensorAdaptor encapsulates a sequence of :ref:`coordinate transformations [0,1,2]"] - T1["Transform
(e.g., Transpose)"] - O1["Output Coords
[2,0,1]"] - I1 --> T1 --> O1 - end - - subgraph "Chained Transforms" - direction TB - I2["Input
2D"] - T2A["Transform A
(e.g., Merge)"] - M2["Intermediate
1D"] - T2B["Transform B
(e.g., Pad)"] - O2["Output
1D Padded"] - I2 --> T2A --> M2 --> T2B --> O2 - end - end - - style T1 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px - style T2A fill:#fff3e0,stroke:#f57c00,stroke-width:2px - style T2B fill:#fff3e0,stroke:#f57c00,stroke-width:2px + .. mermaid:: + + graph LR + subgraph "Adaptor Composition" + subgraph "Single Transform" + direction TB + I1["Input Coords
[0,1,2]"] + T1["Transform
(e.g., Transpose)"] + O1["Output Coords
[2,0,1]"] + I1 --> T1 --> O1 + end + + subgraph "Chained Transforms" + direction TB + I2["Input
2D"] + T2A["Transform A
(e.g., Merge)"] + M2["Intermediate
1D"] + T2B["Transform B
(e.g., Pad)"] + O2["Output
1D Padded"] + I2 --> T2A --> M2 --> T2B --> O2 + end + end + + style T1 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px + style T2A fill:#fff3e0,stroke:#f57c00,stroke-width:2px + style T2B fill:#fff3e0,stroke:#f57c00,stroke-width:2px + + +.. image:: diagrams/adaptors_1.svg + :alt: Diagram + :align: center .. image:: diagrams/adaptors_1.svg :alt: Diagram :align: center @@ -118,44 +126,52 @@ The real power of adaptors comes from chaining multiple transformations together .. Original mermaid diagram (edit here, then run update_diagrams.py) - .. mermaid:: +.. + Original mermaid diagram (edit here, then run update_diagrams.py) - graph LR - subgraph "Adaptor Chaining Flow" - subgraph "Adaptor 1" - A1I["Bottom Dims
[0,1]"] - A1T["Transform:
Merge[2,3]"] - A1O["Top Dims
[0]"] - end - - subgraph "Adaptor 2" - A2I["Bottom Dims
[0]"] - A2T["Transform:
Unmerge[2,3]"] - A2O["Top Dims
[0,1]"] - end - - subgraph "Chained Result" - CI["Input 2D
Bottom[0,1]"] - CO["Output 2D
Top[0,1]"] - end - end - - A1I --> A1T - A1T --> A1O - A1O --> A2I - A2I --> A2T - A2T --> A2O - - CI --> A1I - A2O --> CO - - style A1T fill:#e3f2fd,stroke:#1976d2,stroke-width:2px - style A2T fill:#fff3e0,stroke:#f57c00,stroke-width:2px - style CI fill:#e8f5e9,stroke:#388e3c,stroke-width:2px - style CO fill:#e8f5e9,stroke:#388e3c,stroke-width:2px + .. mermaid:: + + graph LR + subgraph "Adaptor Chaining Flow" + subgraph "Adaptor 1" + A1I["Bottom Dims
[0,1]"] + A1T["Transform:
Merge[2,3]"] + A1O["Top Dims
[0]"] + end + + subgraph "Adaptor 2" + A2I["Bottom Dims
[0]"] + A2T["Transform:
Unmerge[2,3]"] + A2O["Top Dims
[0,1]"] + end + + subgraph "Chained Result" + CI["Input 2D
Bottom[0,1]"] + CO["Output 2D
Top[0,1]"] + end + end + + A1I --> A1T + A1T --> A1O + A1O --> A2I + A2I --> A2T + A2T --> A2O + + CI --> A1I + A2O --> CO + + style A1T fill:#e3f2fd,stroke:#1976d2,stroke-width:2px + style A2T fill:#fff3e0,stroke:#f57c00,stroke-width:2px + style CI fill:#e8f5e9,stroke:#388e3c,stroke-width:2px + style CO fill:#e8f5e9,stroke:#388e3c,stroke-width:2px + + +.. image:: diagrams/adaptors_2.svg + :alt: Diagram + :align: center .. image:: diagrams/adaptors_2.svg :alt: Diagram :align: center diff --git a/docs/conceptual/ck_tile/buffer_views.rst b/docs/conceptual/ck_tile/buffer_views.rst index 5a80723914..d4a04e2df6 100644 --- a/docs/conceptual/ck_tile/buffer_views.rst +++ b/docs/conceptual/ck_tile/buffer_views.rst @@ -23,40 +23,52 @@ Memory coherence and caching policies represent another layer of complexity that Address Space Usage Patterns ---------------------------- -.. raw:: html - -
- flowchart TB - subgraph CF ["Compute Flow"] - direction LR - GM1["Global Memory
Input Data"] --> LDS["LDS
Tile Cache"] - LDS --> VGPR["VGPR
Working Set"] - VGPR --> Compute["Compute
Operations"] - Compute --> VGPR - VGPR --> LDS2["LDS
Reduction"] - LDS2 --> GM2["Global Memory
Output Data"] - end - - subgraph UP ["Usage Pattern"] - direction LR - P1["1. Load tile from Global → LDS"] - P2["2. Load working set LDS → VGPR"] - P3["3. Compute in VGPR"] - P4["4. Store results VGPR → LDS"] - P5["5. Reduce in LDS"] - P6["6. Write final LDS → Global"] - - P1 --> P2 --> P3 --> P4 --> P5 --> P6 - end - - CF ~~~ UP - - style GM1 fill:#fee2e2,stroke:#ef4444,stroke-width:2px - style LDS fill:#fed7aa,stroke:#f59e0b,stroke-width:2px - style VGPR fill:#d1fae5,stroke:#10b981,stroke-width:2px - style Compute fill:#e0e7ff,stroke:#4338ca,stroke-width:2px -
+.. + Original mermaid diagram (edit here, then run update_diagrams.py) + +.. + Original mermaid diagram (edit here, then run update_diagrams.py) + + .. mermaid:: + + flowchart TB + subgraph CF ["Compute Flow"] + direction LR + GM1["Global Memory
Input Data"] --> LDS["LDS
Tile Cache"] + LDS --> VGPR["VGPR
Working Set"] + VGPR --> Compute["Compute
Operations"] + Compute --> VGPR + VGPR --> LDS2["LDS
Reduction"] + LDS2 --> GM2["Global Memory
Output Data"] + end + + subgraph UP ["Usage Pattern"] + direction LR + P1["1. Load tile from Global → LDS"] + P2["2. Load working set LDS → VGPR"] + P3["3. Compute in VGPR"] + P4["4. Store results VGPR → LDS"] + P5["5. Reduce in LDS"] + P6["6. Write final LDS → Global"] + + P1 --> P2 --> P3 --> P4 --> P5 --> P6 + end + + CF ~~~ UP + + style GM1 fill:#fee2e2,stroke:#ef4444,stroke-width:2px + style LDS fill:#fed7aa,stroke:#f59e0b,stroke-width:2px + style VGPR fill:#d1fae5,stroke:#10b981,stroke-width:2px + style Compute fill:#e0e7ff,stroke:#4338ca,stroke-width:2px + + + + + +.. image:: diagrams/buffer_views_1.svg + :alt: Diagram + :align: center C++ Implementation ------------------ @@ -176,76 +188,100 @@ The implementation of vector access maintains the same parameter structure as sc Scalar vs Vectorized Memory Access ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. raw:: html - -
- graph LR - subgraph "Scalar Access (4 instructions)" - S1["Load float[0]"] --> R1["Register 1"] - S2["Load float[1]"] --> R2["Register 2"] - S3["Load float[2]"] --> R3["Register 3"] - S4["Load float[3]"] --> R4["Register 4"] - end - - subgraph "Vectorized Access (1 instruction)" - V1["Load float4[0]"] --> VR["Vector Register
(4 floats)"] - end - - subgraph "Performance Impact" - Perf["4x fewer instructions
Better memory bandwidth
Reduced latency"] - end - - R1 & R2 & R3 & R4 --> Perf - VR --> Perf - - style S1 fill:#fee2e2,stroke:#ef4444,stroke-width:2px - style S2 fill:#fee2e2,stroke:#ef4444,stroke-width:2px - style S3 fill:#fee2e2,stroke:#ef4444,stroke-width:2px - style S4 fill:#fee2e2,stroke:#ef4444,stroke-width:2px - style V1 fill:#d1fae5,stroke:#10b981,stroke-width:2px - style Perf fill:#fef3c7,stroke:#f59e0b,stroke-width:2px -
+.. + Original mermaid diagram (edit here, then run update_diagrams.py) + +.. + Original mermaid diagram (edit here, then run update_diagrams.py) + + .. mermaid:: + + graph LR + subgraph "Scalar Access (4 instructions)" + S1["Load float[0]"] --> R1["Register 1"] + S2["Load float[1]"] --> R2["Register 2"] + S3["Load float[2]"] --> R3["Register 3"] + S4["Load float[3]"] --> R4["Register 4"] + end + + subgraph "Vectorized Access (1 instruction)" + V1["Load float4[0]"] --> VR["Vector Register
(4 floats)"] + end + + subgraph "Performance Impact" + Perf["4x fewer instructions
Better memory bandwidth
Reduced latency"] + end + + R1 & R2 & R3 & R4 --> Perf + VR --> Perf + + style S1 fill:#fee2e2,stroke:#ef4444,stroke-width:2px + style S2 fill:#fee2e2,stroke:#ef4444,stroke-width:2px + style S3 fill:#fee2e2,stroke:#ef4444,stroke-width:2px + style S4 fill:#fee2e2,stroke:#ef4444,stroke-width:2px + style V1 fill:#d1fae5,stroke:#10b981,stroke-width:2px + style Perf fill:#fef3c7,stroke:#f59e0b,stroke-width:2px + + + + + +.. image:: diagrams/buffer_views_2.svg + :alt: Diagram + :align: center Understanding BufferView Indexing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. raw:: html - -
- flowchart LR - subgraph "Input Parameters" - Offset["Offset
(e.g., 5)"] - ValidFlag["Valid Flag
(optional)"] - end - - subgraph "Processing" - BoundsCheck{{"Bounds Check
offset < buffer_size?"}} - FlagCheck{{"Flag Check
valid_flag == True?"}} - Access["Access Memory
buffer[offset]"] - end - - subgraph "Output" - ValidResult["Valid Result
Return value"] - Invalid["Invalid Result
Return 0 or default"] - end - - Offset --> BoundsCheck - ValidFlag --> FlagCheck - - BoundsCheck -->|Yes| FlagCheck - BoundsCheck -->|No| Invalid - - FlagCheck -->|Yes| Access - FlagCheck -->|No| Invalid - - Access --> ValidResult - - style Offset fill:#e0e7ff,stroke:#4338ca,stroke-width:2px - style ValidFlag fill:#e0e7ff,stroke:#4338ca,stroke-width:2px - style ValidResult fill:#d1fae5,stroke:#10b981,stroke-width:2px - style Invalid fill:#fee2e2,stroke:#ef4444,stroke-width:2px -
+.. + Original mermaid diagram (edit here, then run update_diagrams.py) + +.. + Original mermaid diagram (edit here, then run update_diagrams.py) + + .. mermaid:: + + flowchart LR + subgraph "Input Parameters" + Offset["Offset
(e.g., 5)"] + ValidFlag["Valid Flag
(optional)"] + end + + subgraph "Processing" + BoundsCheck{{"Bounds Check
offset < buffer_size?"}} + FlagCheck{{"Flag Check
valid_flag == True?"}} + Access["Access Memory
buffer[offset]"] + end + + subgraph "Output" + ValidResult["Valid Result
Return value"] + Invalid["Invalid Result
Return 0 or default"] + end + + Offset --> BoundsCheck + ValidFlag --> FlagCheck + + BoundsCheck -->|Yes| FlagCheck + BoundsCheck -->|No| Invalid + + FlagCheck -->|Yes| Access + FlagCheck -->|No| Invalid + + Access --> ValidResult + + style Offset fill:#e0e7ff,stroke:#4338ca,stroke-width:2px + style ValidFlag fill:#e0e7ff,stroke:#4338ca,stroke-width:2px + style ValidResult fill:#d1fae5,stroke:#10b981,stroke-width:2px + style Invalid fill:#fee2e2,stroke:#ef4444,stroke-width:2px + + + + + +.. image:: diagrams/buffer_views_3.svg + :alt: Diagram + :align: center C++ Get Operations ~~~~~~~~~~~~~~~~~~ @@ -341,28 +377,40 @@ Atomic Operations Atomic vs Non-Atomic Operations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. raw:: html - -
- graph TB - subgraph "Non-Atomic Operation (Race Condition)" - NA1["Thread 1: Read value (10)"] --> NA2["Thread 1: Add 5 (15)"] - NA3["Thread 2: Read value (10)"] --> NA4["Thread 2: Add 3 (13)"] - NA2 --> NA5["Thread 1: Write 15"] - NA4 --> NA6["Thread 2: Write 13"] - NA5 & NA6 --> NA7["Final value: 13 ❌
(Lost update from Thread 1)"] - end - - subgraph "Atomic Operation (Thread-Safe)" - A1["Thread 1: atomic_add(5)"] --> A2["Hardware ensures
serialization"] - A3["Thread 2: atomic_add(3)"] --> A2 - A2 --> A4["Final value: 18 ✓
(Both updates applied)"] - end - - style NA7 fill:#fee2e2,stroke:#ef4444,stroke-width:2px - style A4 fill:#d1fae5,stroke:#10b981,stroke-width:2px -
+.. + Original mermaid diagram (edit here, then run update_diagrams.py) + +.. + Original mermaid diagram (edit here, then run update_diagrams.py) + + .. mermaid:: + + graph TB + subgraph "Non-Atomic Operation (Race Condition)" + NA1["Thread 1: Read value (10)"] --> NA2["Thread 1: Add 5 (15)"] + NA3["Thread 2: Read value (10)"] --> NA4["Thread 2: Add 3 (13)"] + NA2 --> NA5["Thread 1: Write 15"] + NA4 --> NA6["Thread 2: Write 13"] + NA5 & NA6 --> NA7["Final value: 13 ❌
(Lost update from Thread 1)"] + end + + subgraph "Atomic Operation (Thread-Safe)" + A1["Thread 1: atomic_add(5)"] --> A2["Hardware ensures
serialization"] + A3["Thread 2: atomic_add(3)"] --> A2 + A2 --> A4["Final value: 18 ✓
(Both updates applied)"] + end + + style NA7 fill:#fee2e2,stroke:#ef4444,stroke-width:2px + style A4 fill:#d1fae5,stroke:#10b981,stroke-width:2px + + + + + +.. image:: diagrams/buffer_views_4.svg + :alt: Diagram + :align: center C++ Atomic Operations ~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/conceptual/ck_tile/convert_mermaid_to_svg.py b/docs/conceptual/ck_tile/convert_mermaid_to_svg.py index 3a09f54097..a6cdadd4e4 100644 --- a/docs/conceptual/ck_tile/convert_mermaid_to_svg.py +++ b/docs/conceptual/ck_tile/convert_mermaid_to_svg.py @@ -31,11 +31,16 @@ RST_FILES = [ 'descriptors.rst', 'coordinate_movement.rst', 'adaptors.rst', + 'introduction_motivation.rst', + 'buffer_views.rst', + 'tensor_views.rst', + 'coordinate_systems.rst', + 'tile_distribution.rst', ] -# Pattern to find mermaid blocks +# Pattern to find mermaid blocks (can be indented with 3 spaces for commented blocks) MERMAID_PATTERN = re.compile( - r'^\.\. mermaid::\s*\n((?:(?:\n| .*))*)', + r'^(?: )?\.\. mermaid::\s*\n((?:(?:\n| .*))*)', re.MULTILINE ) diff --git a/docs/conceptual/ck_tile/convert_raw_html_to_commented.py b/docs/conceptual/ck_tile/convert_raw_html_to_commented.py new file mode 100644 index 0000000000..3cba55926d --- /dev/null +++ b/docs/conceptual/ck_tile/convert_raw_html_to_commented.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 +"""Convert raw HTML mermaid blocks to commented format for SVG conversion.""" + +import os +import re + +def convert_raw_html_to_commented(content): + """Convert raw HTML mermaid blocks to commented mermaid format.""" + + # Pattern to match raw HTML mermaid blocks + pattern = r'\.\. raw:: html\n\n
]*>\n(.*?)\n
' + + def replace_block(match): + mermaid_code = match.group(1) + # The mermaid code in HTML has 3-space indentation, keep it + # but add 3 more spaces for .. mermaid:: indentation + mermaid_lines = mermaid_code.split('\n') + properly_indented = [] + for line in mermaid_lines: + if line.strip(): # Non-empty line + # Line already has 3 spaces from HTML, add 3 more for mermaid block + properly_indented.append(' ' + line) + else: + properly_indented.append('') + + indented_code = '\n'.join(properly_indented) + + # Create commented format matching the expected pattern + commented = f""".. + Original mermaid diagram (edit here, then run update_diagrams.py) + + .. mermaid:: + +{indented_code} + + +""" + return commented + + return re.sub(pattern, replace_block, content, flags=re.DOTALL) + +def main(): + """Process files with raw HTML mermaid blocks.""" + + files_to_convert = [ + 'introduction_motivation.rst', + 'buffer_views.rst', + 'tensor_views.rst', + 'coordinate_systems.rst', + 'tile_distribution.rst' + ] + + converted_files = [] + + for filename in files_to_convert: + if not os.path.exists(filename): + print(f'Skipping {filename} - not found') + continue + + with open(filename, 'r', encoding='utf-8') as f: + original = f.read() + + converted = convert_raw_html_to_commented(original) + + if converted != original: + with open(filename, 'w', encoding='utf-8') as f: + f.write(converted) + + blocks_converted = original.count('.. raw:: html') + converted_files.append((filename, blocks_converted)) + print(f'✓ Converted {filename}: {blocks_converted} blocks') + else: + print(f' {filename}: no raw HTML blocks found') + + print(f'\n=== CONVERSION COMPLETE ===') + print(f'Files converted: {len(converted_files)}') + print(f'Total blocks: {sum(c for _, c in converted_files)}') + print('\nNext: Run convert_mermaid_to_svg.py to generate SVG files') + +if __name__ == '__main__': + main() diff --git a/docs/conceptual/ck_tile/convolution_example.rst b/docs/conceptual/ck_tile/convolution_example.rst index 200efcd74e..15c6bce874 100644 --- a/docs/conceptual/ck_tile/convolution_example.rst +++ b/docs/conceptual/ck_tile/convolution_example.rst @@ -18,42 +18,50 @@ The key insight is that convolution can be transformed from a complex nested loo .. Original mermaid diagram (edit here, then run update_diagrams.py) - .. mermaid:: +.. + Original mermaid diagram (edit here, then run update_diagrams.py) - graph TB - subgraph "Convolution Process" - I["Input Image
6×6"] - K["Kernel
3×3"] - SW["Sliding Window
Extract 3×3 patches"] - DP["Dot Product
Element-wise multiply & sum"] - O["Output
4×4"] - end - - subgraph "Im2col Optimization" - W["Windows Matrix
16×9
(all patches)"] - KF["Kernel Flattened
9×1"] - MM["Matrix Multiply
W @ K"] - OF["Output Flattened
16×1"] - end - - I --> SW - K --> DP - SW --> DP - DP --> O - - SW --> W - K --> KF - W --> MM - KF --> MM - MM --> OF - OF --> O - - style I fill:#e3f2fd,stroke:#1976d2,stroke-width:2px - style O fill:#e8f5e9,stroke:#388e3c,stroke-width:2px - style MM fill:#fff3e0,stroke:#f57c00,stroke-width:2px + .. mermaid:: + + graph TB + subgraph "Convolution Process" + I["Input Image
6×6"] + K["Kernel
3×3"] + SW["Sliding Window
Extract 3×3 patches"] + DP["Dot Product
Element-wise multiply & sum"] + O["Output
4×4"] + end + + subgraph "Im2col Optimization" + W["Windows Matrix
16×9
(all patches)"] + KF["Kernel Flattened
9×1"] + MM["Matrix Multiply
W @ K"] + OF["Output Flattened
16×1"] + end + + I --> SW + K --> DP + SW --> DP + DP --> O + + SW --> W + K --> KF + W --> MM + KF --> MM + MM --> OF + OF --> O + + style I fill:#e3f2fd,stroke:#1976d2,stroke-width:2px + style O fill:#e8f5e9,stroke:#388e3c,stroke-width:2px + style MM fill:#fff3e0,stroke:#f57c00,stroke-width:2px + + +.. image:: diagrams/convolution_example.svg + :alt: Diagram + :align: center .. image:: diagrams/convolution_example.svg :alt: Diagram :align: center diff --git a/docs/conceptual/ck_tile/coordinate_movement.rst b/docs/conceptual/ck_tile/coordinate_movement.rst index 860d585a21..6915842139 100644 --- a/docs/conceptual/ck_tile/coordinate_movement.rst +++ b/docs/conceptual/ck_tile/coordinate_movement.rst @@ -20,36 +20,44 @@ For the mathematical foundations of coordinate systems, see :ref:`ck_tile_coordi .. Original mermaid diagram (edit here, then run update_diagrams.py) - .. mermaid:: - - graph TB - subgraph "Coordinate Movement System" - TC["TensorCoordinate
Position + Descriptor Context"] - TAC["TensorAdaptorCoordinate
Position + Transform Context"] - MC["move_coordinate()
Efficient Navigation"] - end - - subgraph "Movement Example" - S["Start: [1,1]
Offset: 5"] - M1["Move [0,1]
→ [1,2]
Offset: 6"] - M2["Move [1,0]
→ [2,2]
Offset: 10"] - M3["Move [1,1]
→ [3,3]
Offset: 15"] - end - - TC --> MC - TAC --> MC - - S --> M1 - M1 --> M2 - M2 --> M3 - - style TC fill:#e3f2fd,stroke:#1976d2,stroke-width:2px - style TAC fill:#fff3e0,stroke:#f57c00,stroke-width:2px - style MC fill:#e8f5e9,stroke:#388e3c,stroke-width:2px +.. + Original mermaid diagram (edit here, then run update_diagrams.py) + .. mermaid:: + + graph TB + subgraph "Coordinate Movement System" + TC["TensorCoordinate
Position + Descriptor Context"] + TAC["TensorAdaptorCoordinate
Position + Transform Context"] + MC["move_coordinate()
Efficient Navigation"] + end + + subgraph "Movement Example" + S["Start: [1,1]
Offset: 5"] + M1["Move [0,1]
→ [1,2]
Offset: 6"] + M2["Move [1,0]
→ [2,2]
Offset: 10"] + M3["Move [1,1]
→ [3,3]
Offset: 15"] + end + + TC --> MC + TAC --> MC + + S --> M1 + M1 --> M2 + M2 --> M3 + + style TC fill:#e3f2fd,stroke:#1976d2,stroke-width:2px + style TAC fill:#fff3e0,stroke:#f57c00,stroke-width:2px + style MC fill:#e8f5e9,stroke:#388e3c,stroke-width:2px + + + +.. image:: diagrams/coordinate_movement.svg + :alt: Diagram + :align: center .. image:: diagrams/coordinate_movement.svg :alt: Diagram :align: center diff --git a/docs/conceptual/ck_tile/coordinate_systems.rst b/docs/conceptual/ck_tile/coordinate_systems.rst index 551f766321..0763d83c46 100644 --- a/docs/conceptual/ck_tile/coordinate_systems.rst +++ b/docs/conceptual/ck_tile/coordinate_systems.rst @@ -15,39 +15,51 @@ The Five Coordinate Spaces The CK framework employs five interconnected coordinate spaces, each serving a specific purpose in the journey from thread identification to memory access. These spaces work together to solve the fundamental challenge of GPU programming: efficiently distributing work across thousands of parallel threads while maintaining optimal memory access patterns. -.. raw:: html - -
- graph TB - subgraph "Coordinate Spaces Overview" - P["P-space
Thread Identification
Which thread am I?"] - Y["Y-space
Logical Tile
Which element in my tile?"] - X["X-space
Physical Tensor
Where in the tensor?"] - R["R-space
Replication
Data sharing pattern"] - D["D-space
Linear Storage
Memory address"] - end - - subgraph "Transformations" - T1["P + Y → X
Thread + Element → Position"] - T2["X → D
Position → Address"] - end - - P --> T1 - Y --> T1 - T1 --> X - X --> T2 - T2 --> D - - R -.-> P - R -.-> Y - - style P fill:#e3f2fd,stroke:#1976d2,stroke-width:2px - style Y fill:#fff3e0,stroke:#f57c00,stroke-width:2px - style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px - style R fill:#fce4ec,stroke:#c2185b,stroke-width:2px - style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px -
+.. + Original mermaid diagram (edit here, then run update_diagrams.py) + +.. + Original mermaid diagram (edit here, then run update_diagrams.py) + + .. mermaid:: + + graph TB + subgraph "Coordinate Spaces Overview" + P["P-space
Thread Identification
Which thread am I?"] + Y["Y-space
Logical Tile
Which element in my tile?"] + X["X-space
Physical Tensor
Where in the tensor?"] + R["R-space
Replication
Data sharing pattern"] + D["D-space
Linear Storage
Memory address"] + end + + subgraph "Transformations" + T1["P + Y → X
Thread + Element → Position"] + T2["X → D
Position → Address"] + end + + P --> T1 + Y --> T1 + T1 --> X + X --> T2 + T2 --> D + + R -.-> P + R -.-> Y + + style P fill:#e3f2fd,stroke:#1976d2,stroke-width:2px + style Y fill:#fff3e0,stroke:#f57c00,stroke-width:2px + style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px + style R fill:#fce4ec,stroke:#c2185b,stroke-width:2px + style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px + + + + + +.. image:: diagrams/coordinate_systems_1.svg + :alt: Diagram + :align: center The Challenge and Solution ~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -69,41 +81,53 @@ P-space (Partition Space) represents the foundation of the coordinate system hie GPU Thread Hierarchy ~~~~~~~~~~~~~~~~~~~~ -.. raw:: html - -
- graph TB - subgraph "GPU Thread Hierarchy" - subgraph "Block" - subgraph "Warp 0" - T0["Thread 0
P=[0,0]"] - T1["Thread 1
P=[0,1]"] - T2["Thread 2
P=[0,2]"] - T31["..."] - T3["Thread 31
P=[0,31]"] - end - subgraph "Warp 1" - T32["Thread 32
P=[1,0]"] - T33["Thread 33
P=[1,1]"] - T34["..."] - T63["Thread 63
P=[1,31]"] - end - W2["Warp 2..."] - W7["Warp 7"] - end - end - - subgraph "P-space Mapping" - PM["P-coordinates = [warp_id, lane_id]
or
P-coordinates = [block_x, block_y, thread_x, thread_y]"] - end - - T0 --> PM - T32 --> PM - - style T0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px - style T32 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px -
+.. + Original mermaid diagram (edit here, then run update_diagrams.py) + +.. + Original mermaid diagram (edit here, then run update_diagrams.py) + + .. mermaid:: + + graph TB + subgraph "GPU Thread Hierarchy" + subgraph "Block" + subgraph "Warp 0" + T0["Thread 0
P=[0,0]"] + T1["Thread 1
P=[0,1]"] + T2["Thread 2
P=[0,2]"] + T31["..."] + T3["Thread 31
P=[0,31]"] + end + subgraph "Warp 1" + T32["Thread 32
P=[1,0]"] + T33["Thread 33
P=[1,1]"] + T34["..."] + T63["Thread 63
P=[1,31]"] + end + W2["Warp 2..."] + W7["Warp 7"] + end + end + + subgraph "P-space Mapping" + PM["P-coordinates = [warp_id, lane_id]
or
P-coordinates = [block_x, block_y, thread_x, thread_y]"] + end + + T0 --> PM + T32 --> PM + + style T0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px + style T32 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px + + + + + +.. image:: diagrams/coordinate_systems_2.svg + :alt: Diagram + :align: center The structure of P-space directly reflects the :ref:`hardware organization ` of modern GPUs. Each thread receives a unique P-coordinate that encodes its position within the execution hierarchy. For simple distributions, P-space might be one-dimensional, containing only a thread ID. For complex hierarchical distributions, P-space can have multiple dimensions representing different levels of the GPU's thread organization. C++ Implementation @@ -147,44 +171,56 @@ Y-space (Yield Space) represents the logical organization of work within each th Work Assignment Structure ~~~~~~~~~~~~~~~~~~~~~~~~~ -.. raw:: html - -
- graph TB - subgraph "Thread's Tile (2x2 elements)" - Y00["Y=[0,0]
Element 0"] - Y01["Y=[0,1]
Element 1"] - Y10["Y=[1,0]
Element 2"] - Y11["Y=[1,1]
Element 3"] - end - - subgraph "Y-space Structure" - YS["Each thread processes
the same Y-space pattern
but at different X locations"] - end - - subgraph "Example: 4 Threads" - T0["Thread 0
P=[0,0]"] - T1["Thread 1
P=[0,1]"] - T2["Thread 2
P=[1,0]"] - T3["Thread 3
P=[1,1]"] - end - - Y00 --> YS - Y01 --> YS - Y10 --> YS - Y11 --> YS - - T0 --> YS - T1 --> YS - T2 --> YS - T3 --> YS - - style Y00 fill:#fff3e0,stroke:#f57c00,stroke-width:2px - style Y01 fill:#fff3e0,stroke:#f57c00,stroke-width:2px - style Y10 fill:#fff3e0,stroke:#f57c00,stroke-width:2px - style Y11 fill:#fff3e0,stroke:#f57c00,stroke-width:2px -
+.. + Original mermaid diagram (edit here, then run update_diagrams.py) + +.. + Original mermaid diagram (edit here, then run update_diagrams.py) + + .. mermaid:: + + graph TB + subgraph "Thread's Tile (2x2 elements)" + Y00["Y=[0,0]
Element 0"] + Y01["Y=[0,1]
Element 1"] + Y10["Y=[1,0]
Element 2"] + Y11["Y=[1,1]
Element 3"] + end + + subgraph "Y-space Structure" + YS["Each thread processes
the same Y-space pattern
but at different X locations"] + end + + subgraph "Example: 4 Threads" + T0["Thread 0
P=[0,0]"] + T1["Thread 1
P=[0,1]"] + T2["Thread 2
P=[1,0]"] + T3["Thread 3
P=[1,1]"] + end + + Y00 --> YS + Y01 --> YS + Y10 --> YS + Y11 --> YS + + T0 --> YS + T1 --> YS + T2 --> YS + T3 --> YS + + style Y00 fill:#fff3e0,stroke:#f57c00,stroke-width:2px + style Y01 fill:#fff3e0,stroke:#f57c00,stroke-width:2px + style Y10 fill:#fff3e0,stroke:#f57c00,stroke-width:2px + style Y11 fill:#fff3e0,stroke:#f57c00,stroke-width:2px + + + + + +.. image:: diagrams/coordinate_systems_3.svg + :alt: Diagram + :align: center The power of Y-space lies in its ability to express different iteration patterns without changing the underlying distribution logic. A thread might traverse its Y-space in row-major order for one algorithm, column-major for another, or even use :ref:`space-filling curves ` for optimal cache utilization. This flexibility enables algorithm-specific optimizations while maintaining a consistent framework. Hierarchical Y-Space @@ -265,37 +301,49 @@ The transformation from P and Y coordinates to X coordinates represents the hear Transformation Pipeline ~~~~~~~~~~~~~~~~~~~~~~~ -.. raw:: html - -
- graph LR - subgraph "Input" - P["P-coordinates
Thread identity
P=[1,0]"] - Y["Y-coordinates
Element in tile
Y=[0,1]"] - end - - subgraph "Transformation" - T["P + Y → X
Base position + Offset"] - end - - subgraph "Output" - X["X-coordinates
Tensor position
X=[2,1]"] - end - - subgraph "Example" - E["Thread P=[1,0] at base (2,0)
Element Y=[0,1] adds offset (0,1)
Result X=[2,1] in tensor"] - end - - P --> T - Y --> T - T --> X - X --> E - - style P fill:#e3f2fd,stroke:#1976d2,stroke-width:2px - style Y fill:#fff3e0,stroke:#f57c00,stroke-width:2px - style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px -
+.. + Original mermaid diagram (edit here, then run update_diagrams.py) + +.. + Original mermaid diagram (edit here, then run update_diagrams.py) + + .. mermaid:: + + graph LR + subgraph "Input" + P["P-coordinates
Thread identity
P=[1,0]"] + Y["Y-coordinates
Element in tile
Y=[0,1]"] + end + + subgraph "Transformation" + T["P + Y → X
Base position + Offset"] + end + + subgraph "Output" + X["X-coordinates
Tensor position
X=[2,1]"] + end + + subgraph "Example" + E["Thread P=[1,0] at base (2,0)
Element Y=[0,1] adds offset (0,1)
Result X=[2,1] in tensor"] + end + + P --> T + Y --> T + T --> X + X --> E + + style P fill:#e3f2fd,stroke:#1976d2,stroke-width:2px + style Y fill:#fff3e0,stroke:#f57c00,stroke-width:2px + style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px + + + + + +.. image:: diagrams/coordinate_systems_4.svg + :alt: Diagram + :align: center Mathematical Foundation ~~~~~~~~~~~~~~~~~~~~~~~ @@ -361,33 +409,45 @@ D-space represents the final transformation in the coordinate pipeline—convert Linearization Strategies ~~~~~~~~~~~~~~~~~~~~~~~~ -.. raw:: html - -
- graph LR - subgraph "X-coordinates" - X["X = [2, 3]
2D Position"] - end - - subgraph "Layout Options" - RM["Row-Major
D = 2×width + 3"] - CM["Column-Major
D = 3×height + 2"] - BL["Blocked
Complex pattern"] - end - - subgraph "D-coordinate" - D["D = 11
Linear Address"] - end - - X --> RM - X --> CM - X --> BL - RM --> D - - style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px - style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px -
+.. + Original mermaid diagram (edit here, then run update_diagrams.py) + +.. + Original mermaid diagram (edit here, then run update_diagrams.py) + + .. mermaid:: + + graph LR + subgraph "X-coordinates" + X["X = [2, 3]
2D Position"] + end + + subgraph "Layout Options" + RM["Row-Major
D = 2×width + 3"] + CM["Column-Major
D = 3×height + 2"] + BL["Blocked
Complex pattern"] + end + + subgraph "D-coordinate" + D["D = 11
Linear Address"] + end + + X --> RM + X --> CM + X --> BL + RM --> D + + style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px + style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px + + + + + +.. image:: diagrams/coordinate_systems_5.svg + :alt: Diagram + :align: center The linearization process must consider multiple factors: .. code-block:: cpp @@ -417,48 +477,60 @@ Complete Pipeline Example Let's trace through a complete example showing how all coordinate spaces work together: -.. raw:: html - -
- graph TB - subgraph "Step 1: Thread Identification" - TID["Thread ID = 5"] - P["P-coordinates
P = [0, 5]
(warp 0, lane 5)"] - end - - subgraph "Step 2: Work Assignment" - Y["Y-coordinates
Y = [1, 0]
(element in tile)"] - end - - subgraph "Step 3: P+Y Transformation" - TRANS["P + Y → X
Thread position + Element offset"] - X["X-coordinates
X = [1, 5]
(tensor position)"] - end - - subgraph "Step 4: Linearization" - LIN["X → D
Row-major: D = x₀ × width + x₁"] - D["D-coordinate
D = 13
(memory address)"] - end - - subgraph "Step 5: Memory Access" - MEM["Hardware accesses
memory[13]"] - end - - TID --> P - P --> TRANS - Y --> TRANS - TRANS --> X - X --> LIN - LIN --> D - D --> MEM - - style P fill:#e3f2fd,stroke:#1976d2,stroke-width:3px - style Y fill:#fff3e0,stroke:#f57c00,stroke-width:3px - style X fill:#e8f5e9,stroke:#388e3c,stroke-width:3px - style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:3px - style MEM fill:#ffebee,stroke:#c62828,stroke-width:3px -
+.. + Original mermaid diagram (edit here, then run update_diagrams.py) + +.. + Original mermaid diagram (edit here, then run update_diagrams.py) + + .. mermaid:: + + graph TB + subgraph "Step 1: Thread Identification" + TID["Thread ID = 5"] + P["P-coordinates
P = [0, 5]
(warp 0, lane 5)"] + end + + subgraph "Step 2: Work Assignment" + Y["Y-coordinates
Y = [1, 0]
(element in tile)"] + end + + subgraph "Step 3: P+Y Transformation" + TRANS["P + Y → X
Thread position + Element offset"] + X["X-coordinates
X = [1, 5]
(tensor position)"] + end + + subgraph "Step 4: Linearization" + LIN["X → D
Row-major: D = x₀ × width + x₁"] + D["D-coordinate
D = 13
(memory address)"] + end + + subgraph "Step 5: Memory Access" + MEM["Hardware accesses
memory[13]"] + end + + TID --> P + P --> TRANS + Y --> TRANS + TRANS --> X + X --> LIN + LIN --> D + D --> MEM + + style P fill:#e3f2fd,stroke:#1976d2,stroke-width:3px + style Y fill:#fff3e0,stroke:#f57c00,stroke-width:3px + style X fill:#e8f5e9,stroke:#388e3c,stroke-width:3px + style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:3px + style MEM fill:#ffebee,stroke:#c62828,stroke-width:3px + + + + + +.. image:: diagrams/coordinate_systems_6.svg + :alt: Diagram + :align: center Real-World Example: Matrix Multiplication ----------------------------------------- diff --git a/docs/conceptual/ck_tile/descriptors.rst b/docs/conceptual/ck_tile/descriptors.rst index 9e980ca3dc..d385e86759 100644 --- a/docs/conceptual/ck_tile/descriptors.rst +++ b/docs/conceptual/ck_tile/descriptors.rst @@ -99,35 +99,43 @@ Every TensorDescriptor in CK Tile can be thought of as a **transformation pipeli .. Original mermaid diagram (edit here, then run update_diagrams.py) - .. mermaid:: +.. + Original mermaid diagram (edit here, then run update_diagrams.py) - graph LR - subgraph "Pipeline Stages" - S1["Stage 1
Base Layout
[M, N]"] - S2["Stage 2
Transform
Unmerge"] - S3["Stage 3
New View
[M1, M2, N]"] - S4["Stage N
Final View
[...]"] - end - - subgraph "Same Data" - D["Physical Memory
No data movement"] - end - - S1 --> S2 - S2 --> S3 - S3 --> S4 - - S1 -.-> D - S2 -.-> D - S3 -.-> D - S4 -.-> D - - style D fill:#ffebee,stroke:#d32f2f,stroke-width:2px - style S1 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px - style S3 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px + .. mermaid:: + + graph LR + subgraph "Pipeline Stages" + S1["Stage 1
Base Layout
[M, N]"] + S2["Stage 2
Transform
Unmerge"] + S3["Stage 3
New View
[M1, M2, N]"] + S4["Stage N
Final View
[...]"] + end + + subgraph "Same Data" + D["Physical Memory
No data movement"] + end + + S1 --> S2 + S2 --> S3 + S3 --> S4 + + S1 -.-> D + S2 -.-> D + S3 -.-> D + S4 -.-> D + + style D fill:#ffebee,stroke:#d32f2f,stroke-width:2px + style S1 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px + style S3 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px + + +.. image:: diagrams/descriptors_1.svg + :alt: Diagram + :align: center .. image:: diagrams/descriptors_1.svg :alt: Diagram :align: center @@ -197,40 +205,48 @@ Analysis of the Final Pipeline .. Original mermaid diagram (edit here, then run update_diagrams.py) - .. mermaid:: +.. + Original mermaid diagram (edit here, then run update_diagrams.py) - graph TB - subgraph "Transform Pipeline" - T0["Transform 0
Base Unmerge
Input: [0]
Output: [1,2]"] - T1["Transform 1
PassThrough
Input: [1]
Output: [3]"] - T2["Transform 2
Unmerge
Input: [2]
Output: [4,5]"] - end - - subgraph "Hidden Dimensions" - H0["Hidden ID 0
Raw Buffer"] - H1["Hidden ID 1
Dim 0 (size 2)"] - H2["Hidden ID 2
Dim 1 (size 6)"] - H3["Hidden ID 3
Final Dim 0"] - H4["Hidden ID 4
Final Dim 1"] - H5["Hidden ID 5
Final Dim 2"] - end - - H0 --> T0 - T0 --> H1 - T0 --> H2 - H1 --> T1 - H2 --> T2 - T1 --> H3 - T2 --> H4 - T2 --> H5 - - style H0 fill:#ffebee,stroke:#d32f2f,stroke-width:2px - style H3 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px - style H4 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px - style H5 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px + .. mermaid:: + + graph TB + subgraph "Transform Pipeline" + T0["Transform 0
Base Unmerge
Input: [0]
Output: [1,2]"] + T1["Transform 1
PassThrough
Input: [1]
Output: [3]"] + T2["Transform 2
Unmerge
Input: [2]
Output: [4,5]"] + end + + subgraph "Hidden Dimensions" + H0["Hidden ID 0
Raw Buffer"] + H1["Hidden ID 1
Dim 0 (size 2)"] + H2["Hidden ID 2
Dim 1 (size 6)"] + H3["Hidden ID 3
Final Dim 0"] + H4["Hidden ID 4
Final Dim 1"] + H5["Hidden ID 5
Final Dim 2"] + end + + H0 --> T0 + T0 --> H1 + T0 --> H2 + H1 --> T1 + H2 --> T2 + T1 --> H3 + T2 --> H4 + T2 --> H5 + + style H0 fill:#ffebee,stroke:#d32f2f,stroke-width:2px + style H3 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px + style H4 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px + style H5 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px + + +.. image:: diagrams/descriptors_2.svg + :alt: Diagram + :align: center .. image:: diagrams/descriptors_2.svg :alt: Diagram :align: center diff --git a/docs/conceptual/ck_tile/diagrams/buffer_views_1.svg b/docs/conceptual/ck_tile/diagrams/buffer_views_1.svg new file mode 100644 index 0000000000..fb696c9e42 --- /dev/null +++ b/docs/conceptual/ck_tile/diagrams/buffer_views_1.svg @@ -0,0 +1 @@ +

Usage Pattern

1. Load tile from Global → LDS
2. Load working set LDS → VGPR
3. Compute in VGPR
4. Store results VGPR → LDS
5. Reduce in LDS
6. Write final LDS → Global

Compute Flow

Global Memory
Input Data

LDS
Tile Cache

VGPR
Working Set

Compute
Operations

LDS
Reduction

Global Memory
Output Data

\ No newline at end of file diff --git a/docs/conceptual/ck_tile/diagrams/buffer_views_2.svg b/docs/conceptual/ck_tile/diagrams/buffer_views_2.svg new file mode 100644 index 0000000000..7a58311b33 --- /dev/null +++ b/docs/conceptual/ck_tile/diagrams/buffer_views_2.svg @@ -0,0 +1 @@ +

Performance Impact

Vectorized Access (1 instruction)

Scalar Access (4 instructions)

Load float[0]

Register 1

Load float[1]

Register 2

Load float[2]

Register 3

Load float[3]

Register 4

Load float4[0]

Vector Register
(4 floats)

4x fewer instructions
Better memory bandwidth
Reduced latency

\ No newline at end of file diff --git a/docs/conceptual/ck_tile/diagrams/buffer_views_3.svg b/docs/conceptual/ck_tile/diagrams/buffer_views_3.svg new file mode 100644 index 0000000000..8e20da9fa0 --- /dev/null +++ b/docs/conceptual/ck_tile/diagrams/buffer_views_3.svg @@ -0,0 +1 @@ +

Output

Processing

Input Parameters

Yes

No

Yes

No

Offset
(e.g., 5)

Valid Flag
(optional)

Bounds Check
offset < buffer_size?

Flag Check
valid_flag == True?

Access Memory
buffer[offset]

Valid Result
Return value

Invalid Result
Return 0 or default

\ No newline at end of file diff --git a/docs/conceptual/ck_tile/diagrams/buffer_views_4.svg b/docs/conceptual/ck_tile/diagrams/buffer_views_4.svg new file mode 100644 index 0000000000..f0b04d283b --- /dev/null +++ b/docs/conceptual/ck_tile/diagrams/buffer_views_4.svg @@ -0,0 +1 @@ +

Atomic Operation (Thread-Safe)

Thread 1: atomic_add(5)

Hardware ensures
serialization

Thread 2: atomic_add(3)

Final value: 18 ✓
(Both updates applied)

Non-Atomic Operation (Race Condition)

Thread 1: Read value (10)

Thread 1: Add 5 (15)

Thread 2: Read value (10)

Thread 2: Add 3 (13)

Thread 1: Write 15

Thread 2: Write 13

Final value: 13 ❌
(Lost update from Thread 1)

\ No newline at end of file diff --git a/docs/conceptual/ck_tile/diagrams/convolution_example.svg b/docs/conceptual/ck_tile/diagrams/convolution_example.svg index 1c36cf4f58..4a86641997 100644 --- a/docs/conceptual/ck_tile/diagrams/convolution_example.svg +++ b/docs/conceptual/ck_tile/diagrams/convolution_example.svg @@ -1 +1 @@ -

Im2col Optimization

Convolution Process

Input Image
6�6

Kernel
3�3

Sliding Window
Extract 3�3 patches

Dot Product
Element-wise multiply & sum

Output
4�4

Windows Matrix
16�9
(all patches)

Kernel Flattened
9�1

Matrix Multiply
W @ K

Output Flattened
16�1

\ No newline at end of file +

Im2col Optimization

Convolution Process

Input Image
6×6

Kernel
3×3

Sliding Window
Extract 3×3 patches

Dot Product
Element-wise multiply & sum

Output
4×4

Windows Matrix
16×9
(all patches)

Kernel Flattened
9×1

Matrix Multiply
W @ K

Output Flattened
16×1

\ No newline at end of file diff --git a/docs/conceptual/ck_tile/diagrams/coordinate_systems_1.svg b/docs/conceptual/ck_tile/diagrams/coordinate_systems_1.svg new file mode 100644 index 0000000000..8890aa2362 --- /dev/null +++ b/docs/conceptual/ck_tile/diagrams/coordinate_systems_1.svg @@ -0,0 +1 @@ +

Transformations

Coordinate Spaces Overview

P-space
Thread Identification
Which thread am I?

Y-space
Logical Tile
Which element in my tile?

X-space
Physical Tensor
Where in the tensor?

R-space
Replication
Data sharing pattern

D-space
Linear Storage
Memory address

P + Y → X
Thread + Element → Position

X → D
Position → Address

\ No newline at end of file diff --git a/docs/conceptual/ck_tile/diagrams/coordinate_systems_2.svg b/docs/conceptual/ck_tile/diagrams/coordinate_systems_2.svg new file mode 100644 index 0000000000..765318910a --- /dev/null +++ b/docs/conceptual/ck_tile/diagrams/coordinate_systems_2.svg @@ -0,0 +1 @@ +

P-space Mapping

GPU Thread Hierarchy

Block

Warp 1

Warp 0

Thread 0
P=[0,0]

Thread 1
P=[0,1]

Thread 2
P=[0,2]

...

Thread 31
P=[0,31]

Thread 32
P=[1,0]

Thread 33
P=[1,1]

...

Thread 63
P=[1,31]

Warp 2...

Warp 7

P-coordinates = [warp_id, lane_id]
or
P-coordinates = [block_x, block_y, thread_x, thread_y]

\ No newline at end of file diff --git a/docs/conceptual/ck_tile/diagrams/coordinate_systems_3.svg b/docs/conceptual/ck_tile/diagrams/coordinate_systems_3.svg new file mode 100644 index 0000000000..47846dfe4b --- /dev/null +++ b/docs/conceptual/ck_tile/diagrams/coordinate_systems_3.svg @@ -0,0 +1 @@ +

Example: 4 Threads

Y-space Structure

Thread's Tile (2x2 elements)

Y=[0,0]
Element 0

Y=[0,1]
Element 1

Y=[1,0]
Element 2

Y=[1,1]
Element 3

Each thread processes
the same Y-space pattern
but at different X locations

Thread 0
P=[0,0]

Thread 1
P=[0,1]

Thread 2
P=[1,0]

Thread 3
P=[1,1]

\ No newline at end of file diff --git a/docs/conceptual/ck_tile/diagrams/coordinate_systems_4.svg b/docs/conceptual/ck_tile/diagrams/coordinate_systems_4.svg new file mode 100644 index 0000000000..3a9f04c73d --- /dev/null +++ b/docs/conceptual/ck_tile/diagrams/coordinate_systems_4.svg @@ -0,0 +1 @@ +

Example

Output

Transformation

Input

P-coordinates
Thread identity
P=[1,0]

Y-coordinates
Element in tile
Y=[0,1]

P + Y → X
Base position + Offset

X-coordinates
Tensor position
X=[2,1]

Thread P=[1,0] at base (2,0)
Element Y=[0,1] adds offset (0,1)
Result X=[2,1] in tensor

\ No newline at end of file diff --git a/docs/conceptual/ck_tile/diagrams/coordinate_systems_5.svg b/docs/conceptual/ck_tile/diagrams/coordinate_systems_5.svg new file mode 100644 index 0000000000..f91d8b39ef --- /dev/null +++ b/docs/conceptual/ck_tile/diagrams/coordinate_systems_5.svg @@ -0,0 +1 @@ +

D-coordinate

Layout Options

X-coordinates

X = [2, 3]
2D Position

Row-Major
D = 2×width + 3

Column-Major
D = 3×height + 2

Blocked
Complex pattern

D = 11
Linear Address

\ No newline at end of file diff --git a/docs/conceptual/ck_tile/diagrams/coordinate_systems_6.svg b/docs/conceptual/ck_tile/diagrams/coordinate_systems_6.svg new file mode 100644 index 0000000000..0e0275457a --- /dev/null +++ b/docs/conceptual/ck_tile/diagrams/coordinate_systems_6.svg @@ -0,0 +1 @@ +

Step 5: Memory Access

Step 4: Linearization

Step 3: P+Y Transformation

Step 2: Work Assignment

Step 1: Thread Identification

Thread ID = 5

P-coordinates
P = [0, 5]
(warp 0, lane 5)

Y-coordinates
Y = [1, 0]
(element in tile)

P + Y → X
Thread position + Element offset

X-coordinates
X = [1, 5]
(tensor position)

X → D
Row-major: D = x₀ × width + x₁

D-coordinate
D = 13
(memory address)

Hardware accesses
memory[13]

\ No newline at end of file diff --git a/docs/conceptual/ck_tile/diagrams/introduction_motivation_1.svg b/docs/conceptual/ck_tile/diagrams/introduction_motivation_1.svg new file mode 100644 index 0000000000..55253de744 --- /dev/null +++ b/docs/conceptual/ck_tile/diagrams/introduction_motivation_1.svg @@ -0,0 +1 @@ +

Tile Distribution Pattern (Efficient)

Memory_TD

Threads_TD

Mem[0]

Thread 0

Mem[1]

Thread 1

Mem[2]

Mem[3]

Thread 2

Mem[4]

Mem[5]

Thread 3

Mem[6]

Mem[7]

Random Access Pattern (Inefficient)

Memory

Threads

Mem[0]

Thread 0

Mem[23]

Thread 1

Mem[7]

Thread 2

Mem[47]

Thread 3

Mem[15]

Mem[31]

Mem[39]

Mem[55]

\ No newline at end of file diff --git a/docs/conceptual/ck_tile/diagrams/introduction_motivation_2.svg b/docs/conceptual/ck_tile/diagrams/introduction_motivation_2.svg new file mode 100644 index 0000000000..524b6b2d40 --- /dev/null +++ b/docs/conceptual/ck_tile/diagrams/introduction_motivation_2.svg @@ -0,0 +1 @@ +

Transformations

Coordinate Spaces

P-space
Thread Position
(thread_x, thread_y,
warp_id, block_id)

Y-space
Local Data
(y0, y1, y2, y3)

X-space
Global Position
(x0, x1)

D-space
Memory Address
(linearized)

P + Y → X
Thread data mapping

X → D
Memory linearization

\ No newline at end of file diff --git a/docs/conceptual/ck_tile/diagrams/tensor_views_1.svg b/docs/conceptual/ck_tile/diagrams/tensor_views_1.svg new file mode 100644 index 0000000000..41338c8902 --- /dev/null +++ b/docs/conceptual/ck_tile/diagrams/tensor_views_1.svg @@ -0,0 +1 @@ +

Logical View

Tensor Layer

Access Layer

Memory Foundation

Flat Memory Array
0 1 2 3 4 5 6 7 8 9 10 11

BufferView
Linear Memory Access

TensorDescriptor
Shape & Stride Info

TensorView
Multi-dimensional Access

2D Matrix View
[3×4]
[[0,1,2,3]
[4,5,6,7]
[8,9,10,11]]

\ No newline at end of file diff --git a/docs/conceptual/ck_tile/diagrams/tensor_views_2.svg b/docs/conceptual/ck_tile/diagrams/tensor_views_2.svg new file mode 100644 index 0000000000..f57636d293 --- /dev/null +++ b/docs/conceptual/ck_tile/diagrams/tensor_views_2.svg @@ -0,0 +1 @@ +

Result

TensorView Processing

User Input

Valid

Coordinate
(1, 2)

Shape Check
row < 3?
col < 4?

Apply Strides
offset = 1×4 + 2×1

BufferView Access
buffer[6]

Value: 6

\ No newline at end of file diff --git a/docs/conceptual/ck_tile/diagrams/tensor_views_3.svg b/docs/conceptual/ck_tile/diagrams/tensor_views_3.svg new file mode 100644 index 0000000000..df13db0c0d --- /dev/null +++ b/docs/conceptual/ck_tile/diagrams/tensor_views_3.svg @@ -0,0 +1 @@ +

Custom Stride (Transposed View)

Memory: [0,1,2,3,4,5,6,7,8,9,10,11]
Shape: (4,3)
Strides: (1,4)

[[0, 4, 8]
[1, 5, 9]
[2, 6, 10]
[3, 7, 11]]

Column-Major Layout (Fortran-style)

Memory: [0,3,6,9,1,4,7,10,2,5,8,11]
Shape: (3,4)
Strides: (1,3)

[[0, 1, 2, 3]
[4, 5, 6, 7]
[8, 9, 10, 11]]

Row-Major Layout (C-style)

Memory: [0,1,2,3,4,5,6,7,8,9,10,11]
Shape: (3,4)
Strides: (4,1)

[[0, 1, 2, 3]
[4, 5, 6, 7]
[8, 9, 10, 11]]

\ No newline at end of file diff --git a/docs/conceptual/ck_tile/diagrams/tensor_views_4.svg b/docs/conceptual/ck_tile/diagrams/tensor_views_4.svg new file mode 100644 index 0000000000..8e521229cf --- /dev/null +++ b/docs/conceptual/ck_tile/diagrams/tensor_views_4.svg @@ -0,0 +1 @@ +

Optimization Strategies

Memory Access Patterns

Sequential Access
(Good cache usage)

Strided Access
(May cause cache misses)

Random Access
(Poor cache usage)

Use row-major for row iteration

Use col-major for column iteration

Minimize stride between accesses

Vectorize when possible

\ No newline at end of file diff --git a/docs/conceptual/ck_tile/diagrams/tensor_views_5.svg b/docs/conceptual/ck_tile/diagrams/tensor_views_5.svg new file mode 100644 index 0000000000..2faec8d8d3 --- /dev/null +++ b/docs/conceptual/ck_tile/diagrams/tensor_views_5.svg @@ -0,0 +1 @@ +

Use Cases

TensorView

BufferView

Linear indexing only

buffer[5]

No shape information

Direct memory access

Multi-dimensional indexing

tensor(1, 2)

Shape-aware operations

Coordinate transformations

BufferView: Low-level memory ops

TensorView: Matrix/tensor algorithms

\ No newline at end of file diff --git a/docs/conceptual/ck_tile/diagrams/tile_distribution_1.svg b/docs/conceptual/ck_tile/diagrams/tile_distribution_1.svg new file mode 100644 index 0000000000..19e7140013 --- /dev/null +++ b/docs/conceptual/ck_tile/diagrams/tile_distribution_1.svg @@ -0,0 +1 @@ +

GPU Execution

Coordinate Spaces

Logical View

Tensor
Multi-dimensional data

TileDistribution
Work assignment

TileWindow
Data view

X: Physical tensor coords

Y: Tile pattern coords

P: Processing element coords

R: Replication coords (optional)

Warps
32 threads each

Lanes
Thread within warp

Registers
Thread-local storage

\ No newline at end of file diff --git a/docs/conceptual/ck_tile/diagrams/tile_distribution_2.svg b/docs/conceptual/ck_tile/diagrams/tile_distribution_2.svg new file mode 100644 index 0000000000..6f588a46c4 --- /dev/null +++ b/docs/conceptual/ck_tile/diagrams/tile_distribution_2.svg @@ -0,0 +1 @@ +

Output

Transformation Pipeline

Input

Thread Coordinates
(warpId, laneId)

P → Y
Thread to pattern

Y → X
Pattern to physical

Y → D
Pattern to register

Memory Coordinates
Global addresses

Register Indices
Local storage

\ No newline at end of file diff --git a/docs/conceptual/ck_tile/diagrams/tile_distribution_3.svg b/docs/conceptual/ck_tile/diagrams/tile_distribution_3.svg new file mode 100644 index 0000000000..0974e138fd --- /dev/null +++ b/docs/conceptual/ck_tile/diagrams/tile_distribution_3.svg @@ -0,0 +1 @@ +

Memory Pattern

Thread Assignment

Problem Space (256×256 Matrix)

Full Matrix
65,536 elements

Tile 1
32×32

Tile 2
32×32

Tile N
32×32

Warp 0
32 threads

Warp 1
32 threads

Lane 0-31
Individual threads

Coalesced Access
Sequential addresses
No bank conflicts

\ No newline at end of file diff --git a/docs/conceptual/ck_tile/diagrams/tile_distribution_4.svg b/docs/conceptual/ck_tile/diagrams/tile_distribution_4.svg new file mode 100644 index 0000000000..894151380d --- /dev/null +++ b/docs/conceptual/ck_tile/diagrams/tile_distribution_4.svg @@ -0,0 +1 @@ +

Level 3: Thread Distribution

Level 2: Warp Distribution

Level 1: Block Distribution

Thread Block
256 threads

Block Tile 1
64×64

Block Tile 2
64×64

Warp
32 threads

Warp Tile 1
16×16

Warp Tile 2
16×16

Thread

Thread Tile
2×2

\ No newline at end of file diff --git a/docs/conceptual/ck_tile/diagrams/tile_distribution_5.svg b/docs/conceptual/ck_tile/diagrams/tile_distribution_5.svg new file mode 100644 index 0000000000..2e46ee58cf --- /dev/null +++ b/docs/conceptual/ck_tile/diagrams/tile_distribution_5.svg @@ -0,0 +1 @@ +

Memory Access

Per Thread

Thread Grid (32×32)

Matrix C (128×128)

16,384 elements

1,024 threads

4×4 tile
16 elements

Coalesced reads
Efficient writes
No conflicts

\ No newline at end of file diff --git a/docs/conceptual/ck_tile/diagrams/tile_distribution_6.svg b/docs/conceptual/ck_tile/diagrams/tile_distribution_6.svg new file mode 100644 index 0000000000..2195465e60 --- /dev/null +++ b/docs/conceptual/ck_tile/diagrams/tile_distribution_6.svg @@ -0,0 +1 @@ +

Output

Stage 3

Stage 2

Stage 1

Input

Thread ID
(0-1023)

P-coordinates
(warp, lane)

Y-coordinates
(tile position)

X-coordinates
(tensor indices)

Memory addresses
Register indices

\ No newline at end of file diff --git a/docs/conceptual/ck_tile/diagrams/tile_distribution_7.svg b/docs/conceptual/ck_tile/diagrams/tile_distribution_7.svg new file mode 100644 index 0000000000..e9ec5a5780 --- /dev/null +++ b/docs/conceptual/ck_tile/diagrams/tile_distribution_7.svg @@ -0,0 +1 @@ +

Performance

With TileDistribution

Manual Implementation

Calculate indices manually

Handle boundary conditions

Ensure coalescing

Manage bank conflicts

~200 lines of code

make_tile_distribution()

Automatic optimization

~10 lines of code

Same performance

Fewer bugs

Portable across GPUs

\ No newline at end of file diff --git a/docs/conceptual/ck_tile/diagrams/tile_window_2.svg b/docs/conceptual/ck_tile/diagrams/tile_window_2.svg index a673c72866..60ec2dd1ce 100644 --- a/docs/conceptual/ck_tile/diagrams/tile_window_2.svg +++ b/docs/conceptual/ck_tile/diagrams/tile_window_2.svg @@ -1 +1 @@ -

Snake Access Pattern

0→1→2→3

7←6←5←4

8→9→10→11

15←14←13←12

Linear Access Pattern

0→1→2→3

4→5→6→7

8→9→10→11

12→13→14→15

\ No newline at end of file +

Snake Access Pattern

0,1,2,3

7,6,5,4

8,9,10,11

15,14,13,12

Linear Access Pattern

0,1,2,3

4,5,6,7

8,9,10,11

12,13,14,15

\ No newline at end of file diff --git a/docs/conceptual/ck_tile/introduction_motivation.rst b/docs/conceptual/ck_tile/introduction_motivation.rst index 66df4187fb..33aaf19627 100644 --- a/docs/conceptual/ck_tile/introduction_motivation.rst +++ b/docs/conceptual/ck_tile/introduction_motivation.rst @@ -17,75 +17,81 @@ In this introduction, we establish the fundamental problems that tile distributi The GPU Memory Problem ---------------------- -.. raw:: html +.. + Original mermaid diagram (edit here, then run update_diagrams.py) + + .. mermaid:: + + graph TB + subgraph "Random Access Pattern (Inefficient)" + subgraph "Threads" + T0_R["Thread 0"] + T1_R["Thread 1"] + T2_R["Thread 2"] + T3_R["Thread 3"] + end -
- graph TB - subgraph "Random Access Pattern (Inefficient)" - subgraph "Threads" - T0_R["Thread 0"] - T1_R["Thread 1"] - T2_R["Thread 2"] - T3_R["Thread 3"] - end - - subgraph "Memory" - M0["Mem[0]"] - M7["Mem[7]"] - M15["Mem[15]"] - M23["Mem[23]"] - M31["Mem[31]"] - M39["Mem[39]"] - M47["Mem[47]"] - M55["Mem[55]"] - end - - T0_R -.-> M23 - T1_R -.-> M7 - T2_R -.-> M47 - T3_R -.-> M15 - end - - subgraph "Tile Distribution Pattern (Efficient)" - subgraph "Threads_TD" - T0_TD["Thread 0"] - T1_TD["Thread 1"] - T2_TD["Thread 2"] - T3_TD["Thread 3"] - end - - subgraph "Memory_TD" - M0_TD["Mem[0]"] - M1_TD["Mem[1]"] - M2_TD["Mem[2]"] - M3_TD["Mem[3]"] - M4_TD["Mem[4]"] - M5_TD["Mem[5]"] - M6_TD["Mem[6]"] - M7_TD["Mem[7]"] - end - - T0_TD --> M0_TD - T0_TD --> M1_TD - T1_TD --> M2_TD - T1_TD --> M3_TD - T2_TD --> M4_TD - T2_TD --> M5_TD - T3_TD --> M6_TD - T3_TD --> M7_TD - end - - style T0_R fill:#fee2e2,stroke:#ef4444,stroke-width:2px - style T1_R fill:#fee2e2,stroke:#ef4444,stroke-width:2px - style T2_R fill:#fee2e2,stroke:#ef4444,stroke-width:2px - style T3_R fill:#fee2e2,stroke:#ef4444,stroke-width:2px - - style T0_TD fill:#d1fae5,stroke:#10b981,stroke-width:2px - style T1_TD fill:#d1fae5,stroke:#10b981,stroke-width:2px - style T2_TD fill:#d1fae5,stroke:#10b981,stroke-width:2px - style T3_TD fill:#d1fae5,stroke:#10b981,stroke-width:2px -
+ subgraph "Memory" + M0["Mem[0]"] + M7["Mem[7]"] + M15["Mem[15]"] + M23["Mem[23]"] + M31["Mem[31]"] + M39["Mem[39]"] + M47["Mem[47]"] + M55["Mem[55]"] + end + T0_R -.-> M23 + T1_R -.-> M7 + T2_R -.-> M47 + T3_R -.-> M15 + end + + subgraph "Tile Distribution Pattern (Efficient)" + subgraph "Threads_TD" + T0_TD["Thread 0"] + T1_TD["Thread 1"] + T2_TD["Thread 2"] + T3_TD["Thread 3"] + end + + subgraph "Memory_TD" + M0_TD["Mem[0]"] + M1_TD["Mem[1]"] + M2_TD["Mem[2]"] + M3_TD["Mem[3]"] + M4_TD["Mem[4]"] + M5_TD["Mem[5]"] + M6_TD["Mem[6]"] + M7_TD["Mem[7]"] + end + + T0_TD --> M0_TD + T0_TD --> M1_TD + T1_TD --> M2_TD + T1_TD --> M3_TD + T2_TD --> M4_TD + T2_TD --> M5_TD + T3_TD --> M6_TD + T3_TD --> M7_TD + end + + style T0_R fill:#fee2e2,stroke:#ef4444,stroke-width:2px + style T1_R fill:#fee2e2,stroke:#ef4444,stroke-width:2px + style T2_R fill:#fee2e2,stroke:#ef4444,stroke-width:2px + style T3_R fill:#fee2e2,stroke:#ef4444,stroke-width:2px + + style T0_TD fill:#d1fae5,stroke:#10b981,stroke-width:2px + style T1_TD fill:#d1fae5,stroke:#10b981,stroke-width:2px + style T2_TD fill:#d1fae5,stroke:#10b981,stroke-width:2px + style T3_TD fill:#d1fae5,stroke:#10b981,stroke-width:2px + + + +.. image:: diagrams/introduction_motivation_1.svg + :alt: Diagram + :align: center Why Random Memory Access is Slow ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -209,36 +215,42 @@ The Coordinate Mapping Insight At the heart of tile distribution lies a profound mathematical insight: efficient GPU computation requires a systematic framework for mapping between different coordinate spaces. This framework transforms the complex problem of thread-to-data assignment into a series of well-defined mathematical transformations, each serving a specific purpose in the journey from abstract algorithm to concrete hardware execution. -.. raw:: html +.. + Original mermaid diagram (edit here, then run update_diagrams.py) + + .. mermaid:: + + graph LR + subgraph "Coordinate Spaces" + P["P-space
Thread Position
(thread_x, thread_y,
warp_id, block_id)"] + Y["Y-space
Local Data
(y0, y1, y2, y3)"] + X["X-space
Global Position
(x0, x1)"] + D["D-space
Memory Address
(linearized)"] + end -
- graph LR - subgraph "Coordinate Spaces" - P["P-space
Thread Position
(thread_x, thread_y,
warp_id, block_id)"] - Y["Y-space
Local Data
(y0, y1, y2, y3)"] - X["X-space
Global Position
(x0, x1)"] - D["D-space
Memory Address
(linearized)"] - end - - subgraph "Transformations" - T1["P + Y → X
Thread data mapping"] - T2["X → D
Memory linearization"] - end - - P --> T1 - Y --> T1 - T1 --> X - X --> T2 - T2 --> D - - style P fill:#e3f2fd,stroke:#1976d2,stroke-width:2px - style Y fill:#fff3e0,stroke:#f57c00,stroke-width:2px - style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px - style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px - style T1 fill:#fef3c7,stroke:#f59e0b,stroke-width:2px - style T2 fill:#fef3c7,stroke:#f59e0b,stroke-width:2px -
+ subgraph "Transformations" + T1["P + Y → X
Thread data mapping"] + T2["X → D
Memory linearization"] + end + P --> T1 + Y --> T1 + T1 --> X + X --> T2 + T2 --> D + + style P fill:#e3f2fd,stroke:#1976d2,stroke-width:2px + style Y fill:#fff3e0,stroke:#f57c00,stroke-width:2px + style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px + style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px + style T1 fill:#fef3c7,stroke:#f59e0b,stroke-width:2px + style T2 fill:#fef3c7,stroke:#f59e0b,stroke-width:2px + + + +.. image:: diagrams/introduction_motivation_2.svg + :alt: Diagram + :align: center The elegance of this approach emerges from its separation of concerns. Each coordinate space represents a distinct aspect of the computation, and the transformations between them encapsulate specific optimization strategies. This separation allows developers to reason about their algorithms in natural terms while the framework handles the complex mapping to efficient hardware execution patterns. **P-space (Thread Position Space)** represents the physical organization of threads on the GPU. This space captures the hierarchical nature of GPU execution, from individual threads identified by their x and y coordinates within a block, to warps that execute in lockstep, to thread blocks that share resources. The coordinates in P-space—thread_x, thread_y, warp_id, and block_id—directly correspond to the hardware's execution model. Understanding P-space is crucial because it determines which threads can cooperate efficiently through shared memory and which threads will execute their memory accesses simultaneously. diff --git a/docs/conceptual/ck_tile/lds_index_swapping.rst b/docs/conceptual/ck_tile/lds_index_swapping.rst index 35211d0368..3b246dad67 100644 --- a/docs/conceptual/ck_tile/lds_index_swapping.rst +++ b/docs/conceptual/ck_tile/lds_index_swapping.rst @@ -28,42 +28,50 @@ The original K coordinate is split into K0 and K1, where K1 represents the threa .. Original mermaid diagram (edit here, then run update_diagrams.py) - .. mermaid:: +.. + Original mermaid diagram (edit here, then run update_diagrams.py) - graph TB - subgraph "3D LDS coordinate [K0, M, K1]" - K0["KPerBlock/KPack * MLdsLayer
K0"] - M["MPerBlock/MLdsLayer
M"] - K1["KPack
K1"] - end + .. mermaid:: - subgraph "XOR Transform" - XT["make_xor_transform"] - end + graph TB + subgraph "3D LDS coordinate [K0, M, K1]" + K0["KPerBlock/KPack * MLdsLayer
K0"] + M["MPerBlock/MLdsLayer
M"] + K1["KPack
K1"] + end + + subgraph "XOR Transform" + XT["make_xor_transform"] + end + + subgraph "Update K0 with XOR transformation" + K01["KPerBlock/KPack * MLdsLayer
K0'"] + M1["MPerBlock/MLdsLayer
M"] + K11["KPack
K1"] + end + + K0 --> XT + M --> XT + K1 --> K11 + + XT --> K01 + XT --> M1 + + style K0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px + style K01 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px + style M fill:#e3f2fd,stroke:#1976d2,stroke-width:2px + style M1 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px + + style K1 fill:#fff3e0,stroke:#f57c00,stroke-width:2px + style K11 fill:#fff3e0,stroke:#f57c00,stroke-width:2px - subgraph "Update K0 with XOR transformation" - K01["KPerBlock/KPack * MLdsLayer
K0'"] - M1["MPerBlock/MLdsLayer
M"] - K11["KPack
K1"] - end - K0 --> XT - M --> XT - K1 --> K11 - - XT --> K01 - XT --> M1 - - style K0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px - style K01 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px - style M fill:#e3f2fd,stroke:#1976d2,stroke-width:2px - style M1 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px - - style K1 fill:#fff3e0,stroke:#f57c00,stroke-width:2px - style K11 fill:#fff3e0,stroke:#f57c00,stroke-width:2px +.. image:: diagrams/lds_index_swapping_1.svg + :alt: Diagram + :align: center .. image:: diagrams/lds_index_swapping_1.svg :alt: Diagram :align: center @@ -83,45 +91,53 @@ The transformed K0' is split into L and K0'' components, creating an intermediat .. Original mermaid diagram (edit here, then run update_diagrams.py) - .. mermaid:: +.. + Original mermaid diagram (edit here, then run update_diagrams.py) - graph TB - subgraph "3D LDS coordinate [K0', M, K1]" - K0["KPerBlock/KPack * MLdsLayer
K0'"] - M["MPerBlock/MLdsLayer
M"] - K1["KPack
K1"] - end + .. mermaid:: - subgraph "Unmerge into 2 components" - UM["make_unmerge_transform"] - end + graph TB + subgraph "3D LDS coordinate [K0', M, K1]" + K0["KPerBlock/KPack * MLdsLayer
K0'"] + M["MPerBlock/MLdsLayer
M"] + K1["KPack
K1"] + end + + subgraph "Unmerge into 2 components" + UM["make_unmerge_transform"] + end + + subgraph "4D intermediate transformation space" + L["MLdsLayer
L"] + M1["MPerBlock/MLdsLayer
M"] + K01["KPerBlock/KPack
K0''"] + K11["KPack
K1"] + end + + K0 --> UM + M --> M1 + K1 --> K11 + + UM --> L + UM --> K01 + + style K0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px + style L fill:#e3f2fd,stroke:#1976d2,stroke-width:2px + style K01 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px + + style M fill:#e8f5e9,stroke:#388e3c,stroke-width:2px + style M1 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px + + style K1 fill:#fff3e0,stroke:#f57c00,stroke-width:2px + style K11 fill:#fff3e0,stroke:#f57c00,stroke-width:2px - subgraph "4D intermediate transformation space" - L["MLdsLayer
L"] - M1["MPerBlock/MLdsLayer
M"] - K01["KPerBlock/KPack
K0''"] - K11["KPack
K1"] - end - K0 --> UM - M --> M1 - K1 --> K11 - - UM --> L - UM --> K01 - - style K0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px - style L fill:#e3f2fd,stroke:#1976d2,stroke-width:2px - style K01 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px - - style M fill:#e8f5e9,stroke:#388e3c,stroke-width:2px - style M1 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px - - style K1 fill:#fff3e0,stroke:#f57c00,stroke-width:2px - style K11 fill:#fff3e0,stroke:#f57c00,stroke-width:2px +.. image:: diagrams/lds_index_swapping_2.svg + :alt: Diagram + :align: center .. image:: diagrams/lds_index_swapping_2.svg :alt: Diagram :align: center @@ -142,50 +158,58 @@ The final step merges the 4D coordinates back into 2D transformed coordinates (M .. Original mermaid diagram (edit here, then run update_diagrams.py) - .. mermaid:: +.. + Original mermaid diagram (edit here, then run update_diagrams.py) - graph TB - subgraph "4D LDS Coordinates [L, M, K0'', K1]" - L["MLdsLayer
L"] - M1["MPerBlock/MLdsLayer
M"] - K0["KPerBlock/KPack
K0''"] - K1["KPack
K1"] - end + .. mermaid:: - subgraph "Merge into 1 component" - ME0["make_merge_transform"] - end + graph TB + subgraph "4D LDS Coordinates [L, M, K0'', K1]" + L["MLdsLayer
L"] + M1["MPerBlock/MLdsLayer
M"] + K0["KPerBlock/KPack
K0''"] + K1["KPack
K1"] + end + + subgraph "Merge into 1 component" + ME0["make_merge_transform"] + end + + subgraph "Merge into 1 component" + ME1["make_merge_transform"] + end + + subgraph "Transformed 2D coordinates [M', K']" + M11["MPerBlock
M'"] + K01["KPerBlock
K'"] + end + + L --> ME0 + M1 --> ME0 + + K0 --> ME1 + K1 --> ME1 + + ME0 --> M11 + ME1 --> K01 + + style K0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px + style K1 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px + style K01 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px + + style M1 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px + style L fill:#e8f5e9,stroke:#388e3c,stroke-width:2px + style M11 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px - subgraph "Merge into 1 component" - ME1["make_merge_transform"] - end - subgraph "Transformed 2D coordinates [M', K']" - M11["MPerBlock
M'"] - K01["KPerBlock
K'"] - end - L --> ME0 - M1 --> ME0 - K0 --> ME1 - K1 --> ME1 - - ME0 --> M11 - ME1 --> K01 - - style K0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px - style K1 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px - style K01 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px - - style M1 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px - style L fill:#e8f5e9,stroke:#388e3c,stroke-width:2px - style M11 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px - - +.. image:: diagrams/lds_index_swapping_3.svg + :alt: Diagram + :align: center .. image:: diagrams/lds_index_swapping_3.svg :alt: Diagram :align: center diff --git a/docs/conceptual/ck_tile/load_store_traits.rst b/docs/conceptual/ck_tile/load_store_traits.rst index 7816157d0c..f269a406f0 100644 --- a/docs/conceptual/ck_tile/load_store_traits.rst +++ b/docs/conceptual/ck_tile/load_store_traits.rst @@ -106,27 +106,35 @@ LoadStoreTraits employs a advanced algorithm to select the best dimension for ve .. Original mermaid diagram (edit here, then run update_diagrams.py) - .. mermaid:: +.. + Original mermaid diagram (edit here, then run update_diagrams.py) - graph TD - A[Analyze Distribution] --> B{Check Each Dimension} - B --> C[Calculate Stride] - C --> D{Stride == 1?} - D -->|Yes| E[Candidate for Vectorization] - D -->|No| F[Skip Dimension] - E --> G[Check Alignment] - G --> H[Check Vector Size] - H --> I[Score Dimension] - F --> B - I --> J[Select Best Dimension] - J --> K[Configure Vector Access] - - style A fill:#e3f2fd,stroke:#1976d2,stroke-width:2px - style J fill:#e8f5e9,stroke:#388e3c,stroke-width:2px - style K fill:#fff3e0,stroke:#f57c00,stroke-width:2px + .. mermaid:: + + graph TD + A[Analyze Distribution] --> B{Check Each Dimension} + B --> C[Calculate Stride] + C --> D{Stride == 1?} + D -->|Yes| E[Candidate for Vectorization] + D -->|No| F[Skip Dimension] + E --> G[Check Alignment] + G --> H[Check Vector Size] + H --> I[Score Dimension] + F --> B + I --> J[Select Best Dimension] + J --> K[Configure Vector Access] + + style A fill:#e3f2fd,stroke:#1976d2,stroke-width:2px + style J fill:#e8f5e9,stroke:#388e3c,stroke-width:2px + style K fill:#fff3e0,stroke:#f57c00,stroke-width:2px + + +.. image:: diagrams/load_store_traits_1.svg + :alt: Diagram + :align: center .. image:: diagrams/load_store_traits_1.svg :alt: Diagram :align: center @@ -174,36 +182,44 @@ LoadStoreTraits creates efficient access patterns using space-filling curves: .. Original mermaid diagram (edit here, then run update_diagrams.py) - .. mermaid:: +.. + Original mermaid diagram (edit here, then run update_diagrams.py) - graph LR - subgraph "Linear Traversal" - L1["0→1→2→3"] - L2["4→5→6→7"] - L3["Cache miss"] - L4["8→9→10→11"] - end - - subgraph "Snake Pattern" - S1["0→1→2→3"] - S2["7←6←5←4"] - S3["Cache hit!"] - S4["8→9→10→11"] - end - - L1 --> L2 - L2 --> L3 - L3 --> L4 - - S1 --> S2 - S2 --> S3 - S3 --> S4 - - style L3 fill:#fee2e2,stroke:#ef4444,stroke-width:2px - style S3 fill:#d1fae5,stroke:#10b981,stroke-width:2px + .. mermaid:: + + graph LR + subgraph "Linear Traversal" + L1["0→1→2→3"] + L2["4→5→6→7"] + L3["Cache miss"] + L4["8→9→10→11"] + end + + subgraph "Snake Pattern" + S1["0→1→2→3"] + S2["7←6←5←4"] + S3["Cache hit!"] + S4["8→9→10→11"] + end + + L1 --> L2 + L2 --> L3 + L3 --> L4 + + S1 --> S2 + S2 --> S3 + S3 --> S4 + + style L3 fill:#fee2e2,stroke:#ef4444,stroke-width:2px + style S3 fill:#d1fae5,stroke:#10b981,stroke-width:2px + + +.. image:: diagrams/load_store_traits_2.svg + :alt: Diagram + :align: center .. image:: diagrams/load_store_traits_2.svg :alt: Diagram :align: center diff --git a/docs/conceptual/ck_tile/space_filling_curve.rst b/docs/conceptual/ck_tile/space_filling_curve.rst index 5d98f2fd3d..67b9b53824 100644 --- a/docs/conceptual/ck_tile/space_filling_curve.rst +++ b/docs/conceptual/ck_tile/space_filling_curve.rst @@ -188,36 +188,44 @@ The snake pattern reverses traversal direction on alternate rows, minimizing the .. Original mermaid diagram (edit here, then run update_diagrams.py) - .. mermaid:: +.. + Original mermaid diagram (edit here, then run update_diagrams.py) - graph LR - subgraph "Linear Pattern" - L1["Row 0: →"] - L2["Row 1: →"] - L3["Jump back"] - L4["Row 2: →"] - end - - subgraph "Snake Pattern" - S1["Row 0: →"] - S2["Row 1: ←"] - S3["Continue"] - S4["Row 2: →"] - end - - L1 --> L3 - L3 --> L2 - L2 --> L3 - L3 --> L4 - - S1 --> S2 - S2 --> S4 - - style L3 fill:#fee2e2,stroke:#ef4444,stroke-width:2px - style S3 fill:#d1fae5,stroke:#10b981,stroke-width:2px + .. mermaid:: + + graph LR + subgraph "Linear Pattern" + L1["Row 0: →"] + L2["Row 1: →"] + L3["Jump back"] + L4["Row 2: →"] + end + + subgraph "Snake Pattern" + S1["Row 0: →"] + S2["Row 1: ←"] + S3["Continue"] + S4["Row 2: →"] + end + + L1 --> L3 + L3 --> L2 + L2 --> L3 + L3 --> L4 + + S1 --> S2 + S2 --> S4 + + style L3 fill:#fee2e2,stroke:#ef4444,stroke-width:2px + style S3 fill:#d1fae5,stroke:#10b981,stroke-width:2px + + +.. image:: diagrams/space_filling_curve.svg + :alt: Diagram + :align: center .. image:: diagrams/space_filling_curve.svg :alt: Diagram :align: center diff --git a/docs/conceptual/ck_tile/static_distributed_tensor.rst b/docs/conceptual/ck_tile/static_distributed_tensor.rst index fe7ca2a365..a454a8ca38 100644 --- a/docs/conceptual/ck_tile/static_distributed_tensor.rst +++ b/docs/conceptual/ck_tile/static_distributed_tensor.rst @@ -88,21 +88,29 @@ The memory layout follows a hierarchical pattern: .. Original mermaid diagram (edit here, then run update_diagrams.py) - .. mermaid:: +.. + Original mermaid diagram (edit here, then run update_diagrams.py) - graph TD - A[Global Tensor 64x64] --> B[Thread Block 16x16] - B --> C[Thread 0,0
Elements 0:3,0:3] - B --> D[Thread 0,1
Elements 0:3,4:7] - B --> E[Thread 1,0
Elements 4:7,0:3] - B --> F[...] - - C --> G[Local Array
16 elements] - D --> H[Local Array
16 elements] - E --> I[Local Array
16 elements] + .. mermaid:: + + graph TD + A[Global Tensor 64x64] --> B[Thread Block 16x16] + B --> C[Thread 0,0
Elements 0:3,0:3] + B --> D[Thread 0,1
Elements 0:3,4:7] + B --> E[Thread 1,0
Elements 4:7,0:3] + B --> F[...] + + C --> G[Local Array
16 elements] + D --> H[Local Array
16 elements] + E --> I[Local Array
16 elements] + + +.. image:: diagrams/static_distributed_tensor.svg + :alt: Diagram + :align: center .. image:: diagrams/static_distributed_tensor.svg :alt: Diagram :align: center diff --git a/docs/conceptual/ck_tile/sweep_tile.rst b/docs/conceptual/ck_tile/sweep_tile.rst index 7ad7040764..25208cae05 100644 --- a/docs/conceptual/ck_tile/sweep_tile.rst +++ b/docs/conceptual/ck_tile/sweep_tile.rst @@ -18,35 +18,43 @@ The key insight is the "load once, use many times" pattern. Load X data once int .. Original mermaid diagram (edit here, then run update_diagrams.py) - .. mermaid:: +.. + Original mermaid diagram (edit here, then run update_diagrams.py) - flowchart LR - subgraph "X-Tile (Reused)" - XT["X data loaded once
Stays in registers"] - end - - subgraph "Y-Sweep" - Y1["Y position 0"] - Y2["Y position 1"] - Y3["Y position 2"] - YN["Y position N"] - end - - subgraph "Computation" - C["Process(X, Y)"] - end - - XT --> C - Y1 --> C - Y2 --> C - Y3 --> C - YN --> C - - style XT fill:#e8f5e9,stroke:#388e3c,stroke-width:2px - style C fill:#e0e7ff,stroke:#4338ca,stroke-width:2px + .. mermaid:: + + flowchart LR + subgraph "X-Tile (Reused)" + XT["X data loaded once
Stays in registers"] + end + + subgraph "Y-Sweep" + Y1["Y position 0"] + Y2["Y position 1"] + Y3["Y position 2"] + YN["Y position N"] + end + + subgraph "Computation" + C["Process(X, Y)"] + end + + XT --> C + Y1 --> C + Y2 --> C + Y3 --> C + YN --> C + + style XT fill:#e8f5e9,stroke:#388e3c,stroke-width:2px + style C fill:#e0e7ff,stroke:#4338ca,stroke-width:2px + + +.. image:: diagrams/sweep_tile_1.svg + :alt: Diagram + :align: center .. image:: diagrams/sweep_tile_1.svg :alt: Diagram :align: center @@ -122,30 +130,38 @@ The sweep pattern provides significant memory efficiency benefits: .. Original mermaid diagram (edit here, then run update_diagrams.py) - .. mermaid:: +.. + Original mermaid diagram (edit here, then run update_diagrams.py) - graph TB - subgraph "Traditional Approach" - T1["Load X[0]"] --> P1["Process"] - T2["Load Y[0]"] --> P1 - T3["Load X[0]"] --> P2["Process"] - T4["Load Y[1]"] --> P2 - T5["Load X[0]"] --> P3["Process"] - T6["Load Y[2]"] --> P3 - Note1["X loaded 3 times!"] - end - - subgraph "Sweep Approach" - S1["Load X[0]"] --> SP["Process with
Y[0], Y[1], Y[2]"] - S2["Load Y[0,1,2]"] --> SP - Note2["X loaded once!"] - end - - style Note1 fill:#fee2e2,stroke:#ef4444,stroke-width:2px - style Note2 fill:#d1fae5,stroke:#10b981,stroke-width:2px + .. mermaid:: + + graph TB + subgraph "Traditional Approach" + T1["Load X[0]"] --> P1["Process"] + T2["Load Y[0]"] --> P1 + T3["Load X[0]"] --> P2["Process"] + T4["Load Y[1]"] --> P2 + T5["Load X[0]"] --> P3["Process"] + T6["Load Y[2]"] --> P3 + Note1["X loaded 3 times!"] + end + + subgraph "Sweep Approach" + S1["Load X[0]"] --> SP["Process with
Y[0], Y[1], Y[2]"] + S2["Load Y[0,1,2]"] --> SP + Note2["X loaded once!"] + end + + style Note1 fill:#fee2e2,stroke:#ef4444,stroke-width:2px + style Note2 fill:#d1fae5,stroke:#10b981,stroke-width:2px + + +.. image:: diagrams/sweep_tile_2.svg + :alt: Diagram + :align: center .. image:: diagrams/sweep_tile_2.svg :alt: Diagram :align: center @@ -381,37 +397,45 @@ Sweep operations provide several performance benefits: .. Original mermaid diagram (edit here, then run update_diagrams.py) - .. mermaid:: +.. + Original mermaid diagram (edit here, then run update_diagrams.py) - graph TB - subgraph "Sweep Performance Benefits" - B1["Zero runtime overhead
Compile-time unrolling"] - B2["Perfect memory coalescing
Sequential access patterns"] - B3["Automatic vectorization
Compiler optimizations"] - B4["Register reuse
X data stays in VGPR"] - end - - subgraph "Use Cases" - U1["Matrix Multiplication
Reuse A columns"] - U2["Convolution
Reuse filter weights"] - U3["Reduction
Accumulate over Y"] - U4["Broadcast
Apply X to all Y"] - end - - B1 --> Performance["High Performance"] - B2 --> Performance - B3 --> Performance - B4 --> Performance - - Performance --> U1 - Performance --> U2 - Performance --> U3 - Performance --> U4 - - style Performance fill:#d1fae5,stroke:#10b981,stroke-width:3px + .. mermaid:: + + graph TB + subgraph "Sweep Performance Benefits" + B1["Zero runtime overhead
Compile-time unrolling"] + B2["Perfect memory coalescing
Sequential access patterns"] + B3["Automatic vectorization
Compiler optimizations"] + B4["Register reuse
X data stays in VGPR"] + end + + subgraph "Use Cases" + U1["Matrix Multiplication
Reuse A columns"] + U2["Convolution
Reuse filter weights"] + U3["Reduction
Accumulate over Y"] + U4["Broadcast
Apply X to all Y"] + end + + B1 --> Performance["High Performance"] + B2 --> Performance + B3 --> Performance + B4 --> Performance + + Performance --> U1 + Performance --> U2 + Performance --> U3 + Performance --> U4 + + style Performance fill:#d1fae5,stroke:#10b981,stroke-width:3px + + +.. image:: diagrams/sweep_tile_3.svg + :alt: Diagram + :align: center .. image:: diagrams/sweep_tile_3.svg :alt: Diagram :align: center @@ -453,28 +477,36 @@ Complete workflow example: .. Original mermaid diagram (edit here, then run update_diagrams.py) - .. mermaid:: +.. + Original mermaid diagram (edit here, then run update_diagrams.py) - flowchart TB - subgraph "Complete Workflow" - TD["TileDistribution
Define data layout"] - TW["TileWindow
Create view"] - DT["DistributedTensor
Load X data"] - ST["SweepTile
Iterate Y positions"] - R["Results
Store outputs"] - end - - TD --> TW - TW --> DT - DT --> ST - ST --> R - - style TD fill:#e3f2fd,stroke:#1976d2,stroke-width:2px - style ST fill:#fff3e0,stroke:#f57c00,stroke-width:3px - style R fill:#e8f5e9,stroke:#388e3c,stroke-width:2px + .. mermaid:: + + flowchart TB + subgraph "Complete Workflow" + TD["TileDistribution
Define data layout"] + TW["TileWindow
Create view"] + DT["DistributedTensor
Load X data"] + ST["SweepTile
Iterate Y positions"] + R["Results
Store outputs"] + end + + TD --> TW + TW --> DT + DT --> ST + ST --> R + + style TD fill:#e3f2fd,stroke:#1976d2,stroke-width:2px + style ST fill:#fff3e0,stroke:#f57c00,stroke-width:3px + style R fill:#e8f5e9,stroke:#388e3c,stroke-width:2px + + +.. image:: diagrams/sweep_tile_4.svg + :alt: Diagram + :align: center .. image:: diagrams/sweep_tile_4.svg :alt: Diagram :align: center diff --git a/docs/conceptual/ck_tile/tensor_views.rst b/docs/conceptual/ck_tile/tensor_views.rst index 7866eb9e58..0ba4bb4e82 100644 --- a/docs/conceptual/ck_tile/tensor_views.rst +++ b/docs/conceptual/ck_tile/tensor_views.rst @@ -13,40 +13,52 @@ The power of TensorView lies in its ability to present different logical views o TensorView Architecture ----------------------- -.. raw:: html - -
- graph TB - subgraph "Memory Foundation" - Memory["Flat Memory Array
0 1 2 3 4 5 6 7 8 9 10 11"] - end - - subgraph "Access Layer" - BufferView["BufferView
Linear Memory Access"] - Descriptor["TensorDescriptor
Shape & Stride Info"] - end - - subgraph "Tensor Layer" - TensorView["TensorView
Multi-dimensional Access"] - end - - subgraph "Logical View" - Matrix["2D Matrix View
[3×4]
[[0,1,2,3]
[4,5,6,7]
[8,9,10,11]]"] - end - - Memory --> BufferView - Memory --> Descriptor - BufferView --> TensorView - Descriptor --> TensorView - TensorView --> Matrix - - style Memory fill:#d1fae5,stroke:#10b981,stroke-width:2px - style BufferView fill:#dbeafe,stroke:#3b82f6,stroke-width:2px - style Descriptor fill:#fed7aa,stroke:#f59e0b,stroke-width:2px - style TensorView fill:#fce7f3,stroke:#ec4899,stroke-width:2px - style Matrix fill:#e9d5ff,stroke:#9333ea,stroke-width:2px -
+.. + Original mermaid diagram (edit here, then run update_diagrams.py) + +.. + Original mermaid diagram (edit here, then run update_diagrams.py) + + .. mermaid:: + + graph TB + subgraph "Memory Foundation" + Memory["Flat Memory Array
0 1 2 3 4 5 6 7 8 9 10 11"] + end + + subgraph "Access Layer" + BufferView["BufferView
Linear Memory Access"] + Descriptor["TensorDescriptor
Shape & Stride Info"] + end + + subgraph "Tensor Layer" + TensorView["TensorView
Multi-dimensional Access"] + end + + subgraph "Logical View" + Matrix["2D Matrix View
[3×4]
[[0,1,2,3]
[4,5,6,7]
[8,9,10,11]]"] + end + + Memory --> BufferView + Memory --> Descriptor + BufferView --> TensorView + Descriptor --> TensorView + TensorView --> Matrix + + style Memory fill:#d1fae5,stroke:#10b981,stroke-width:2px + style BufferView fill:#dbeafe,stroke:#3b82f6,stroke-width:2px + style Descriptor fill:#fed7aa,stroke:#f59e0b,stroke-width:2px + style TensorView fill:#fce7f3,stroke:#ec4899,stroke-width:2px + style Matrix fill:#e9d5ff,stroke:#9333ea,stroke-width:2px + + + + + +.. image:: diagrams/tensor_views_1.svg + :alt: Diagram + :align: center The Foundation: BufferView and TensorDescriptor ------------------------------------------------ @@ -109,68 +121,92 @@ Coordinate-Based Access The fundamental operation of TensorView is translating multi-dimensional coordinates into memory accesses. This translation happens through a advanced pipeline that maintains efficiency while providing flexibility: -.. raw:: html - -
- flowchart LR - subgraph "User Input" - Coord["Coordinate
(1, 2)"] - end - - subgraph "TensorView Processing" - Shape["Shape Check
row < 3?
col < 4?"] - Stride["Apply Strides
offset = 1×4 + 2×1"] - Buffer["BufferView Access
buffer[6]"] - end - - subgraph "Result" - Value["Value: 6"] - end - - Coord --> Shape - Shape -->|Valid| Stride - Stride --> Buffer - Buffer --> Value - - style Coord fill:#e0e7ff,stroke:#4338ca,stroke-width:2px - style Shape fill:#fef3c7,stroke:#f59e0b,stroke-width:2px - style Stride fill:#dcfce7,stroke:#10b981,stroke-width:2px - style Buffer fill:#dbeafe,stroke:#3b82f6,stroke-width:2px - style Value fill:#d1fae5,stroke:#10b981,stroke-width:2px -
+.. + Original mermaid diagram (edit here, then run update_diagrams.py) + +.. + Original mermaid diagram (edit here, then run update_diagrams.py) + + .. mermaid:: + + flowchart LR + subgraph "User Input" + Coord["Coordinate
(1, 2)"] + end + + subgraph "TensorView Processing" + Shape["Shape Check
row < 3?
col < 4?"] + Stride["Apply Strides
offset = 1×4 + 2×1"] + Buffer["BufferView Access
buffer[6]"] + end + + subgraph "Result" + Value["Value: 6"] + end + + Coord --> Shape + Shape -->|Valid| Stride + Stride --> Buffer + Buffer --> Value + + style Coord fill:#e0e7ff,stroke:#4338ca,stroke-width:2px + style Shape fill:#fef3c7,stroke:#f59e0b,stroke-width:2px + style Stride fill:#dcfce7,stroke:#10b981,stroke-width:2px + style Buffer fill:#dbeafe,stroke:#3b82f6,stroke-width:2px + style Value fill:#d1fae5,stroke:#10b981,stroke-width:2px + + + + + +.. image:: diagrams/tensor_views_2.svg + :alt: Diagram + :align: center Memory Layouts and Strides -------------------------- One of the most key features of TensorView is its ability to represent different memory layouts through stride manipulation. This capability enables zero-copy transformations that would otherwise require expensive memory operations: -.. raw:: html - -
- graph TB - subgraph "Row-Major Layout (C-style)" - RM["Memory: [0,1,2,3,4,5,6,7,8,9,10,11]
Shape: (3,4)
Strides: (4,1)"] - RMMatrix["[[0, 1, 2, 3]
[4, 5, 6, 7]
[8, 9, 10, 11]]"] - RM --> RMMatrix - end - - subgraph "Column-Major Layout (Fortran-style)" - CM["Memory: [0,3,6,9,1,4,7,10,2,5,8,11]
Shape: (3,4)
Strides: (1,3)"] - CMMatrix["[[0, 1, 2, 3]
[4, 5, 6, 7]
[8, 9, 10, 11]]"] - CM --> CMMatrix - end - - subgraph "Custom Stride (Transposed View)" - TV["Memory: [0,1,2,3,4,5,6,7,8,9,10,11]
Shape: (4,3)
Strides: (1,4)"] - TVMatrix["[[0, 4, 8]
[1, 5, 9]
[2, 6, 10]
[3, 7, 11]]"] - TV --> TVMatrix - end - - style RM fill:#e0f2fe,stroke:#0284c7,stroke-width:2px - style CM fill:#fef3c7,stroke:#f59e0b,stroke-width:2px - style TV fill:#f3e8ff,stroke:#9333ea,stroke-width:2px -
+.. + Original mermaid diagram (edit here, then run update_diagrams.py) + +.. + Original mermaid diagram (edit here, then run update_diagrams.py) + + .. mermaid:: + + graph TB + subgraph "Row-Major Layout (C-style)" + RM["Memory: [0,1,2,3,4,5,6,7,8,9,10,11]
Shape: (3,4)
Strides: (4,1)"] + RMMatrix["[[0, 1, 2, 3]
[4, 5, 6, 7]
[8, 9, 10, 11]]"] + RM --> RMMatrix + end + + subgraph "Column-Major Layout (Fortran-style)" + CM["Memory: [0,3,6,9,1,4,7,10,2,5,8,11]
Shape: (3,4)
Strides: (1,3)"] + CMMatrix["[[0, 1, 2, 3]
[4, 5, 6, 7]
[8, 9, 10, 11]]"] + CM --> CMMatrix + end + + subgraph "Custom Stride (Transposed View)" + TV["Memory: [0,1,2,3,4,5,6,7,8,9,10,11]
Shape: (4,3)
Strides: (1,4)"] + TVMatrix["[[0, 4, 8]
[1, 5, 9]
[2, 6, 10]
[3, 7, 11]]"] + TV --> TVMatrix + end + + style RM fill:#e0f2fe,stroke:#0284c7,stroke-width:2px + style CM fill:#fef3c7,stroke:#f59e0b,stroke-width:2px + style TV fill:#f3e8ff,stroke:#9333ea,stroke-width:2px + + + + + +.. image:: diagrams/tensor_views_3.svg + :alt: Diagram + :align: center Row-Major vs Column-Major Layouts ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -287,33 +323,45 @@ Memory Access Patterns The efficiency of TensorView operations depends critically on memory access patterns. Understanding these patterns is essential for achieving optimal performance (see :ref:`ck_tile_gpu_basics` for hardware considerations): -.. raw:: html - -
- graph LR - subgraph "Memory Access Patterns" - Seq["Sequential Access
(Good cache usage)"] - Stride["Strided Access
(May cause cache misses)"] - Random["Random Access
(Poor cache usage)"] - end - - subgraph "Optimization Strategies" - Opt1["Use row-major for row iteration"] - Opt2["Use col-major for column iteration"] - Opt3["Minimize stride between accesses"] - Opt4["Vectorize when possible"] - end - - Seq --> Opt1 - Stride --> Opt2 - Stride --> Opt3 - Random --> Opt4 - - style Seq fill:#d1fae5,stroke:#10b981,stroke-width:2px - style Stride fill:#fef3c7,stroke:#f59e0b,stroke-width:2px - style Random fill:#fee2e2,stroke:#ef4444,stroke-width:2px -
+.. + Original mermaid diagram (edit here, then run update_diagrams.py) + +.. + Original mermaid diagram (edit here, then run update_diagrams.py) + + .. mermaid:: + + graph LR + subgraph "Memory Access Patterns" + Seq["Sequential Access
(Good cache usage)"] + Stride["Strided Access
(May cause cache misses)"] + Random["Random Access
(Poor cache usage)"] + end + + subgraph "Optimization Strategies" + Opt1["Use row-major for row iteration"] + Opt2["Use col-major for column iteration"] + Opt3["Minimize stride between accesses"] + Opt4["Vectorize when possible"] + end + + Seq --> Opt1 + Stride --> Opt2 + Stride --> Opt3 + Random --> Opt4 + + style Seq fill:#d1fae5,stroke:#10b981,stroke-width:2px + style Stride fill:#fef3c7,stroke:#f59e0b,stroke-width:2px + style Random fill:#fee2e2,stroke:#ef4444,stroke-width:2px + + + + + +.. image:: diagrams/tensor_views_4.svg + :alt: Diagram + :align: center Compile-Time Optimization ~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -339,36 +387,48 @@ TensorView vs BufferView Understanding when to use TensorView versus BufferView is crucial for writing efficient code: -.. raw:: html - -
- graph TB - subgraph "BufferView" - BV1["Linear indexing only"] - BV2["buffer[5]"] - BV3["No shape information"] - BV4["Direct memory access"] - end - - subgraph "TensorView" - TV1["Multi-dimensional indexing"] - TV2["tensor(1, 2)"] - TV3["Shape-aware operations"] - TV4["Coordinate transformations"] - end - - subgraph "Use Cases" - UC1["BufferView: Low-level memory ops"] - UC2["TensorView: Matrix/tensor algorithms"] - end - - BV1 --> UC1 - TV1 --> UC2 - - style BV1 fill:#dbeafe,stroke:#3b82f6,stroke-width:2px - style TV1 fill:#fce7f3,stroke:#ec4899,stroke-width:2px -
+.. + Original mermaid diagram (edit here, then run update_diagrams.py) + +.. + Original mermaid diagram (edit here, then run update_diagrams.py) + + .. mermaid:: + + graph TB + subgraph "BufferView" + BV1["Linear indexing only"] + BV2["buffer[5]"] + BV3["No shape information"] + BV4["Direct memory access"] + end + + subgraph "TensorView" + TV1["Multi-dimensional indexing"] + TV2["tensor(1, 2)"] + TV3["Shape-aware operations"] + TV4["Coordinate transformations"] + end + + subgraph "Use Cases" + UC1["BufferView: Low-level memory ops"] + UC2["TensorView: Matrix/tensor algorithms"] + end + + BV1 --> UC1 + TV1 --> UC2 + + style BV1 fill:#dbeafe,stroke:#3b82f6,stroke-width:2px + style TV1 fill:#fce7f3,stroke:#ec4899,stroke-width:2px + + + + + +.. image:: diagrams/tensor_views_5.svg + :alt: Diagram + :align: center BufferView excels at raw memory operations where linear access is natural or where the overhead of coordinate calculation would be prohibitive. TensorView shines when algorithms naturally think in terms of multi-dimensional coordinates, such as matrix operations, image processing, or tensor contractions. Integration with Tile Distribution diff --git a/docs/conceptual/ck_tile/thread_mapping.rst b/docs/conceptual/ck_tile/thread_mapping.rst index 69135c54fc..29dd817b9f 100644 --- a/docs/conceptual/ck_tile/thread_mapping.rst +++ b/docs/conceptual/ck_tile/thread_mapping.rst @@ -82,53 +82,61 @@ Composable Kernel abstracts thread identification into partition indices, buildi .. Original mermaid diagram (edit here, then run update_diagrams.py) - .. mermaid:: +.. + Original mermaid diagram (edit here, then run update_diagrams.py) - graph TB - subgraph "GPU Device" - subgraph "Thread Block" - subgraph "Warp 0" - T0["Thread 0
lane_id=0"] - T1["Thread 1
lane_id=1"] - T2["..."] - T31["Thread 31
lane_id=31"] - end - - subgraph "Warp 1" - T32["Thread 32
lane_id=0"] - T33["Thread 33
lane_id=1"] - T34["..."] - T63["Thread 63
lane_id=31"] - end - - W2["Warp 2"] - W3["..."] - W7["Warp 7"] - end - end - - subgraph "Thread Identification" - TID["Thread ID = blockIdx.x * blockDim.x + threadIdx.x"] - WID["Warp ID = threadIdx.x / 32"] - LID["Lane ID = threadIdx.x % 32"] - end - - subgraph "P-space Mapping" - P["P-coordinates
NDimP=1: [thread_id]
NDimP=2: [warp_id, lane_id]"] - end - - T0 --> TID - TID --> WID - TID --> LID - WID --> P - LID --> P - - style T0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px - style T32 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px - style P fill:#fff3e0,stroke:#f57c00,stroke-width:3px + .. mermaid:: + + graph TB + subgraph "GPU Device" + subgraph "Thread Block" + subgraph "Warp 0" + T0["Thread 0
lane_id=0"] + T1["Thread 1
lane_id=1"] + T2["..."] + T31["Thread 31
lane_id=31"] + end + + subgraph "Warp 1" + T32["Thread 32
lane_id=0"] + T33["Thread 33
lane_id=1"] + T34["..."] + T63["Thread 63
lane_id=31"] + end + + W2["Warp 2"] + W3["..."] + W7["Warp 7"] + end + end + + subgraph "Thread Identification" + TID["Thread ID = blockIdx.x * blockDim.x + threadIdx.x"] + WID["Warp ID = threadIdx.x / 32"] + LID["Lane ID = threadIdx.x % 32"] + end + + subgraph "P-space Mapping" + P["P-coordinates
NDimP=1: [thread_id]
NDimP=2: [warp_id, lane_id]"] + end + + T0 --> TID + TID --> WID + TID --> LID + WID --> P + LID --> P + + style T0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px + style T32 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px + style P fill:#fff3e0,stroke:#f57c00,stroke-width:3px + + +.. image:: diagrams/thread_mapping_1.svg + :alt: Diagram + :align: center .. image:: diagrams/thread_mapping_1.svg :alt: Diagram :align: center @@ -179,43 +187,51 @@ Once threads know their IDs, they need to map those IDs to specific data element .. Original mermaid diagram (edit here, then run update_diagrams.py) - .. mermaid:: +.. + Original mermaid diagram (edit here, then run update_diagrams.py) - graph TB - subgraph "Thread to Data Mapping" - subgraph "Thread Grid" - T00["Thread[0,0]
Warp 0"] - T01["Thread[0,1]
Warp 0"] - T10["Thread[1,0]
Warp 1"] - T11["Thread[1,1]
Warp 1"] - end - - subgraph "Data Tiles" - D00["Data[0:4, 0:4]
16 elements"] - D01["Data[0:4, 4:8]
16 elements"] - D10["Data[4:8, 0:4]
16 elements"] - D11["Data[4:8, 4:8]
16 elements"] - end - - subgraph "Memory Access" - MA["Coalesced Access
Adjacent threads → Adjacent memory"] - end - end - - T00 --> D00 - T01 --> D01 - T10 --> D10 - T11 --> D11 - - D00 --> MA - D01 --> MA - - style T00 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px - style D00 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px - style MA fill:#fff3e0,stroke:#f57c00,stroke-width:2px + .. mermaid:: + + graph TB + subgraph "Thread to Data Mapping" + subgraph "Thread Grid" + T00["Thread[0,0]
Warp 0"] + T01["Thread[0,1]
Warp 0"] + T10["Thread[1,0]
Warp 1"] + T11["Thread[1,1]
Warp 1"] + end + + subgraph "Data Tiles" + D00["Data[0:4, 0:4]
16 elements"] + D01["Data[0:4, 4:8]
16 elements"] + D10["Data[4:8, 0:4]
16 elements"] + D11["Data[4:8, 4:8]
16 elements"] + end + + subgraph "Memory Access" + MA["Coalesced Access
Adjacent threads → Adjacent memory"] + end + end + + T00 --> D00 + T01 --> D01 + T10 --> D10 + T11 --> D11 + + D00 --> MA + D01 --> MA + + style T00 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px + style D00 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px + style MA fill:#fff3e0,stroke:#f57c00,stroke-width:2px + + +.. image:: diagrams/thread_mapping_2.svg + :alt: Diagram + :align: center .. image:: diagrams/thread_mapping_2.svg :alt: Diagram :align: center diff --git a/docs/conceptual/ck_tile/tile_distribution.rst b/docs/conceptual/ck_tile/tile_distribution.rst index db7eb7a466..e36b3d60be 100644 --- a/docs/conceptual/ck_tile/tile_distribution.rst +++ b/docs/conceptual/ck_tile/tile_distribution.rst @@ -19,79 +19,103 @@ The elegance of this design lies in its ability to adapt to diverse computationa Complete Tile Distribution System Overview ------------------------------------------ -.. raw:: html - -
- graph TB - subgraph "Logical View" - T["Tensor
Multi-dimensional data"] - TD["TileDistribution
Work assignment"] - TW["TileWindow
Data view"] - end - - subgraph "Coordinate Spaces" - X["X: Physical tensor coords"] - Y["Y: Tile pattern coords"] - P["P: Processing element coords"] - R["R: Replication coords (optional)"] - end - - subgraph "GPU Execution" - W["Warps
32 threads each"] - L["Lanes
Thread within warp"] - REG["Registers
Thread-local storage"] - end - - T --> TD - TD --> TW - - TD --> X - TD --> Y - TD --> P - TD --> R - - P --> W - P --> L - TW --> REG - - style TD fill:#e3f2fd,stroke:#1976d2,stroke-width:3px - style P fill:#fff3e0,stroke:#f57c00,stroke-width:2px - style REG fill:#e8f5e9,stroke:#388e3c,stroke-width:2px -
+.. + Original mermaid diagram (edit here, then run update_diagrams.py) + +.. + Original mermaid diagram (edit here, then run update_diagrams.py) + + .. mermaid:: + + graph TB + subgraph "Logical View" + T["Tensor
Multi-dimensional data"] + TD["TileDistribution
Work assignment"] + TW["TileWindow
Data view"] + end + + subgraph "Coordinate Spaces" + X["X: Physical tensor coords"] + Y["Y: Tile pattern coords"] + P["P: Processing element coords"] + R["R: Replication coords (optional)"] + end + + subgraph "GPU Execution" + W["Warps
32 threads each"] + L["Lanes
Thread within warp"] + REG["Registers
Thread-local storage"] + end + + T --> TD + TD --> TW + + TD --> X + TD --> Y + TD --> P + TD --> R + + P --> W + P --> L + TW --> REG + + style TD fill:#e3f2fd,stroke:#1976d2,stroke-width:3px + style P fill:#fff3e0,stroke:#f57c00,stroke-width:2px + style REG fill:#e8f5e9,stroke:#388e3c,stroke-width:2px + + + + + +.. image:: diagrams/tile_distribution_1.svg + :alt: Diagram + :align: center Coordinate System Architecture ------------------------------ -.. raw:: html - -
- flowchart LR - subgraph "Input" - TC["Thread Coordinates
(warpId, laneId)"] - end - - subgraph "Transformation Pipeline" - P2Y["P → Y
Thread to pattern"] - Y2X["Y → X
Pattern to physical"] - Y2D["Y → D
Pattern to register"] - end - - subgraph "Output" - MC["Memory Coordinates
Global addresses"] - RI["Register Indices
Local storage"] - end - - TC --> P2Y - P2Y --> Y2X - P2Y --> Y2D - Y2X --> MC - Y2D --> RI - - style TC fill:#e0e7ff,stroke:#4338ca,stroke-width:2px - style MC fill:#d1fae5,stroke:#10b981,stroke-width:2px - style RI fill:#fef3c7,stroke:#f59e0b,stroke-width:2px -
+.. + Original mermaid diagram (edit here, then run update_diagrams.py) + +.. + Original mermaid diagram (edit here, then run update_diagrams.py) + + .. mermaid:: + + flowchart LR + subgraph "Input" + TC["Thread Coordinates
(warpId, laneId)"] + end + + subgraph "Transformation Pipeline" + P2Y["P → Y
Thread to pattern"] + Y2X["Y → X
Pattern to physical"] + Y2D["Y → D
Pattern to register"] + end + + subgraph "Output" + MC["Memory Coordinates
Global addresses"] + RI["Register Indices
Local storage"] + end + + TC --> P2Y + P2Y --> Y2X + P2Y --> Y2D + Y2X --> MC + Y2D --> RI + + style TC fill:#e0e7ff,stroke:#4338ca,stroke-width:2px + style MC fill:#d1fae5,stroke:#10b981,stroke-width:2px + style RI fill:#fef3c7,stroke:#f59e0b,stroke-width:2px + + + + + +.. image:: diagrams/tile_distribution_2.svg + :alt: Diagram + :align: center What is Tile Distribution? -------------------------- @@ -132,41 +156,53 @@ The key insight that makes TileDistribution effective is its ability to abstract Problem Space Mapping --------------------- -.. raw:: html - -
+.. + Original mermaid diagram (edit here, then run update_diagrams.py) + +.. + Original mermaid diagram (edit here, then run update_diagrams.py) + + .. mermaid:: + + + graph TB + subgraph "Problem Space (256×256 Matrix)" + M["Full Matrix
65,536 elements"] + T1["Tile 1
32×32"] + T2["Tile 2
32×32"] + TN["Tile N
32×32"] + end + + subgraph "Thread Assignment" + W0["Warp 0
32 threads"] + W1["Warp 1
32 threads"] + L0["Lane 0-31
Individual threads"] + end + + subgraph "Memory Pattern" + MP["Coalesced Access
Sequential addresses
No bank conflicts"] + end + + M --> T1 + M --> T2 + M --> TN + + T1 --> W0 + T1 --> W1 + W0 --> L0 + L0 --> MP + + style M fill:#fee2e2,stroke:#ef4444,stroke-width:2px + style MP fill:#d1fae5,stroke:#10b981,stroke-width:2px + + + + - graph TB - subgraph "Problem Space (256×256 Matrix)" - M["Full Matrix
65,536 elements"] - T1["Tile 1
32×32"] - T2["Tile 2
32×32"] - TN["Tile N
32×32"] - end - - subgraph "Thread Assignment" - W0["Warp 0
32 threads"] - W1["Warp 1
32 threads"] - L0["Lane 0-31
Individual threads"] - end - - subgraph "Memory Pattern" - MP["Coalesced Access
Sequential addresses
No bank conflicts"] - end - - M --> T1 - M --> T2 - M --> TN - - T1 --> W0 - T1 --> W1 - W0 --> L0 - L0 --> MP - - style M fill:#fee2e2,stroke:#ef4444,stroke-width:2px - style MP fill:#d1fae5,stroke:#10b981,stroke-width:2px -
+.. image:: diagrams/tile_distribution_3.svg + :alt: Diagram + :align: center Creating a TileDistribution --------------------------- @@ -339,38 +375,50 @@ Let's see how to create and use a TileDistribution in practice: Hierarchical Decomposition -------------------------- -.. raw:: html - -
- graph TB - subgraph "Level 1: Block Distribution" - B["Thread Block
256 threads"] - BT1["Block Tile 1
64×64"] - BT2["Block Tile 2
64×64"] - end - - subgraph "Level 2: Warp Distribution" - W["Warp
32 threads"] - WT1["Warp Tile 1
16×16"] - WT2["Warp Tile 2
16×16"] - end - - subgraph "Level 3: Thread Distribution" - T["Thread"] - TT["Thread Tile
2×2"] - end - - B --> BT1 - BT1 --> W - W --> WT1 - WT1 --> T - T --> TT - - style B fill:#e3f2fd,stroke:#1976d2,stroke-width:2px - style W fill:#fff3e0,stroke:#f57c00,stroke-width:2px - style T fill:#e8f5e9,stroke:#388e3c,stroke-width:2px -
+.. + Original mermaid diagram (edit here, then run update_diagrams.py) + +.. + Original mermaid diagram (edit here, then run update_diagrams.py) + + .. mermaid:: + + graph TB + subgraph "Level 1: Block Distribution" + B["Thread Block
256 threads"] + BT1["Block Tile 1
64×64"] + BT2["Block Tile 2
64×64"] + end + + subgraph "Level 2: Warp Distribution" + W["Warp
32 threads"] + WT1["Warp Tile 1
16×16"] + WT2["Warp Tile 2
16×16"] + end + + subgraph "Level 3: Thread Distribution" + T["Thread"] + TT["Thread Tile
2×2"] + end + + B --> BT1 + BT1 --> W + W --> WT1 + WT1 --> T + T --> TT + + style B fill:#e3f2fd,stroke:#1976d2,stroke-width:2px + style W fill:#fff3e0,stroke:#f57c00,stroke-width:2px + style T fill:#e8f5e9,stroke:#388e3c,stroke-width:2px + + + + + +.. image:: diagrams/tile_distribution_4.svg + :alt: Diagram + :align: center Advanced Example: Matrix Multiplication Distribution ---------------------------------------------------- @@ -422,36 +470,48 @@ Advanced Example: Matrix Multiplication Distribution Work Distribution Pattern ------------------------- -.. raw:: html - -
- flowchart TB - subgraph "Matrix C (128×128)" - C["16,384 elements"] - end - - subgraph "Thread Grid (32×32)" - TG["1,024 threads"] - end - - subgraph "Per Thread" - PT["4×4 tile
16 elements"] - end - - subgraph "Memory Access" - MA["Coalesced reads
Efficient writes
No conflicts"] - end - - C --> TG - TG --> PT - PT --> MA - - style C fill:#fee2e2,stroke:#ef4444,stroke-width:2px - style TG fill:#e3f2fd,stroke:#1976d2,stroke-width:2px - style PT fill:#e8f5e9,stroke:#388e3c,stroke-width:2px - style MA fill:#d1fae5,stroke:#10b981,stroke-width:2px -
+.. + Original mermaid diagram (edit here, then run update_diagrams.py) + +.. + Original mermaid diagram (edit here, then run update_diagrams.py) + + .. mermaid:: + + flowchart TB + subgraph "Matrix C (128×128)" + C["16,384 elements"] + end + + subgraph "Thread Grid (32×32)" + TG["1,024 threads"] + end + + subgraph "Per Thread" + PT["4×4 tile
16 elements"] + end + + subgraph "Memory Access" + MA["Coalesced reads
Efficient writes
No conflicts"] + end + + C --> TG + TG --> PT + PT --> MA + + style C fill:#fee2e2,stroke:#ef4444,stroke-width:2px + style TG fill:#e3f2fd,stroke:#1976d2,stroke-width:2px + style PT fill:#e8f5e9,stroke:#388e3c,stroke-width:2px + style MA fill:#d1fae5,stroke:#10b981,stroke-width:2px + + + + + +.. image:: diagrams/tile_distribution_5.svg + :alt: Diagram + :align: center Memory Access Patterns ---------------------- @@ -464,79 +524,103 @@ One of the key benefits of TileDistribution is generating optimal memory access Transformation Pipeline ----------------------- -.. raw:: html - -
- graph LR - subgraph "Input" - TID["Thread ID
(0-1023)"] - end - - subgraph "Stage 1" - P["P-coordinates
(warp, lane)"] - end - - subgraph "Stage 2" - Y["Y-coordinates
(tile position)"] - end - - subgraph "Stage 3" - X["X-coordinates
(tensor indices)"] - end - - subgraph "Output" - ADDR["Memory addresses
Register indices"] - end - - TID --> P - P --> Y - Y --> X - X --> ADDR - - style TID fill:#e0e7ff,stroke:#4338ca,stroke-width:2px - style ADDR fill:#d1fae5,stroke:#10b981,stroke-width:2px -
+.. + Original mermaid diagram (edit here, then run update_diagrams.py) + +.. + Original mermaid diagram (edit here, then run update_diagrams.py) + + .. mermaid:: + + graph LR + subgraph "Input" + TID["Thread ID
(0-1023)"] + end + + subgraph "Stage 1" + P["P-coordinates
(warp, lane)"] + end + + subgraph "Stage 2" + Y["Y-coordinates
(tile position)"] + end + + subgraph "Stage 3" + X["X-coordinates
(tensor indices)"] + end + + subgraph "Output" + ADDR["Memory addresses
Register indices"] + end + + TID --> P + P --> Y + Y --> X + X --> ADDR + + style TID fill:#e0e7ff,stroke:#4338ca,stroke-width:2px + style ADDR fill:#d1fae5,stroke:#10b981,stroke-width:2px + + + + + +.. image:: diagrams/tile_distribution_6.svg + :alt: Diagram + :align: center Performance Comparison ---------------------- -.. raw:: html - -
- graph TB - subgraph "Manual Implementation" - M1["Calculate indices manually"] - M2["Handle boundary conditions"] - M3["Ensure coalescing"] - M4["Manage bank conflicts"] - M5["~200 lines of code"] - end - - subgraph "With TileDistribution" - T1["make_tile_distribution()"] - T2["Automatic optimization"] - T3["~10 lines of code"] - end - - subgraph "Performance" - P1["Same performance"] - P2["Fewer bugs"] - P3["Portable across GPUs"] - end - - M1 --> M5 - T1 --> T3 - - M5 --> P1 - T3 --> P1 - P1 --> P2 - P2 --> P3 - - style M5 fill:#fee2e2,stroke:#ef4444,stroke-width:2px - style T3 fill:#d1fae5,stroke:#10b981,stroke-width:2px - style P3 fill:#fef3c7,stroke:#f59e0b,stroke-width:2px -
+.. + Original mermaid diagram (edit here, then run update_diagrams.py) + +.. + Original mermaid diagram (edit here, then run update_diagrams.py) + + .. mermaid:: + + graph TB + subgraph "Manual Implementation" + M1["Calculate indices manually"] + M2["Handle boundary conditions"] + M3["Ensure coalescing"] + M4["Manage bank conflicts"] + M5["~200 lines of code"] + end + + subgraph "With TileDistribution" + T1["make_tile_distribution()"] + T2["Automatic optimization"] + T3["~10 lines of code"] + end + + subgraph "Performance" + P1["Same performance"] + P2["Fewer bugs"] + P3["Portable across GPUs"] + end + + M1 --> M5 + T1 --> T3 + + M5 --> P1 + T3 --> P1 + P1 --> P2 + P2 --> P3 + + style M5 fill:#fee2e2,stroke:#ef4444,stroke-width:2px + style T3 fill:#d1fae5,stroke:#10b981,stroke-width:2px + style P3 fill:#fef3c7,stroke:#f59e0b,stroke-width:2px + + + + + +.. image:: diagrams/tile_distribution_7.svg + :alt: Diagram + :align: center Summary ------- diff --git a/docs/conceptual/ck_tile/transforms.rst b/docs/conceptual/ck_tile/transforms.rst index 7c9e9a21b8..1ad6fdec78 100644 --- a/docs/conceptual/ck_tile/transforms.rst +++ b/docs/conceptual/ck_tile/transforms.rst @@ -32,28 +32,36 @@ Zero-Copy Logical Operations .. Original mermaid diagram (edit here, then run update_diagrams.py) - .. mermaid:: +.. + Original mermaid diagram (edit here, then run update_diagrams.py) - graph TB - subgraph "Tensor Coordinate Transformation" - US["Lower Dimension Space
Source coordinate system"] - LS["Upper Dimension Space
Target coordinate system"] - - DATA["Linear Data in Memory
Layout determined by tensor
shape & strides"] - end - - US -->|"Forward Transform"| LS - LS -->|"Inverse Transform"| US - - DATA -.->|"Same data,
different views"| US - DATA -.->|"Same data,
different views"| LS - - style US fill:#e3f2fd,stroke:#1976d2,stroke-width:3px - style LS fill:#fff3e0,stroke:#f57c00,stroke-width:3px - style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5 + .. mermaid:: + + graph TB + subgraph "Tensor Coordinate Transformation" + US["Lower Dimension Space
Source coordinate system"] + LS["Upper Dimension Space
Target coordinate system"] + + DATA["Linear Data in Memory
Layout determined by tensor
shape & strides"] + end + + US -->|"Forward Transform"| LS + LS -->|"Inverse Transform"| US + + DATA -.->|"Same data,
different views"| US + DATA -.->|"Same data,
different views"| LS + + style US fill:#e3f2fd,stroke:#1976d2,stroke-width:3px + style LS fill:#fff3e0,stroke:#f57c00,stroke-width:3px + style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5 + + +.. image:: diagrams/transforms_1.svg + :alt: Diagram + :align: center .. image:: diagrams/transforms_1.svg :alt: Diagram :align: center @@ -74,38 +82,46 @@ Transform System Architecture .. Original mermaid diagram (edit here, then run update_diagrams.py) - .. mermaid:: +.. + Original mermaid diagram (edit here, then run update_diagrams.py) - graph TB - - subgraph "Transform Types" - EMB["EmbedTransform
Linear → Multi-D Strided"] - UNM["MergeTransform
Multi-D → Linear"] - MRG["UnmergeTransform
Linear → Multi-D"] - REP["ReplicateTransform
0D → Multi-D Broadcast"] - OFF["OffsetTransform
Translation"] - PAS["PassThroughTransform
Identity"] - PAD["PadTransform
Boundaries"] - end - - subgraph "Operations" - FWD["Forward
calculate_lower_index()"] - BWD["Backward
calculate_upper_index()"] - UPD["Update
update_lower_index()"] - end - - EMB --> FWD - UNM --> FWD - MRG --> FWD - REP --> FWD - OFF --> FWD - PAS --> FWD - PAD --> FWD - - style FWD fill:#e8f5e9,stroke:#388e3c,stroke-width:2px + .. mermaid:: + + graph TB + + subgraph "Transform Types" + EMB["EmbedTransform
Linear → Multi-D Strided"] + UNM["MergeTransform
Multi-D → Linear"] + MRG["UnmergeTransform
Linear → Multi-D"] + REP["ReplicateTransform
0D → Multi-D Broadcast"] + OFF["OffsetTransform
Translation"] + PAS["PassThroughTransform
Identity"] + PAD["PadTransform
Boundaries"] + end + + subgraph "Operations" + FWD["Forward
calculate_lower_index()"] + BWD["Backward
calculate_upper_index()"] + UPD["Update
update_lower_index()"] + end + + EMB --> FWD + UNM --> FWD + MRG --> FWD + REP --> FWD + OFF --> FWD + PAS --> FWD + PAD --> FWD + + style FWD fill:#e8f5e9,stroke:#388e3c,stroke-width:2px + + +.. image:: diagrams/transforms_2.svg + :alt: Diagram + :align: center .. image:: diagrams/transforms_2.svg :alt: Diagram :align: center @@ -117,28 +133,36 @@ MergeTransform collapses multiple dimensions from the lower coordinate space int .. Original mermaid diagram (edit here, then run update_diagrams.py) - .. mermaid:: +.. + Original mermaid diagram (edit here, then run update_diagrams.py) - graph TB - subgraph "MergeTransform: Multi-D → Linear" - LS["Lower Coordinate Space
2D: [4, 5]
Coord: (2, 3)"] - US["Upper Coordinate Space
1D Linear
Index: 13"] - - DATA["Same Tensor Data
Layout: row-major
Size: 20 elements"] - end - - LS -->|"Forward Transform
2×5 + 3 = 13"| US - US -->|"Inverse Transform
13÷5=2, 13%5=3"| LS - - DATA -.->|"Multi-dimensional
view"| LS - DATA -.->|"Linear
view"| US - - style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px - style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px - style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5 + .. mermaid:: + + graph TB + subgraph "MergeTransform: Multi-D → Linear" + LS["Lower Coordinate Space
2D: [4, 5]
Coord: (2, 3)"] + US["Upper Coordinate Space
1D Linear
Index: 13"] + + DATA["Same Tensor Data
Layout: row-major
Size: 20 elements"] + end + + LS -->|"Forward Transform
2×5 + 3 = 13"| US + US -->|"Inverse Transform
13÷5=2, 13%5=3"| LS + + DATA -.->|"Multi-dimensional
view"| LS + DATA -.->|"Linear
view"| US + + style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px + style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px + style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5 + + +.. image:: diagrams/transforms_3.svg + :alt: Diagram + :align: center .. image:: diagrams/transforms_3.svg :alt: Diagram :align: center @@ -180,28 +204,36 @@ UnmergeTransform expands coordinates from a single dimension in the lower coordi .. Original mermaid diagram (edit here, then run update_diagrams.py) - .. mermaid:: +.. + Original mermaid diagram (edit here, then run update_diagrams.py) - graph TB - subgraph "UnmergeTransform: Linear → Multi-D" - LS["Lower Coordinate Space
1D Linear
Index: 14"] - US["Upper Coordinate Space
3D: [3, 4, 2]
Coord: (1, 3, 0)"] - - DATA["Same Tensor Data
Layout: row-major
Size: 24 elements"] - end - - LS -->|"Forward Transform
14 = 1×8 + 3×2 + 0"| US - US -->|"Inverse Transform
linearize back"| LS - - DATA -.->|"Linear
view"| LS - DATA -.->|"Multi-dimensional
view"| US - - style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px - style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px - style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5 + .. mermaid:: + + graph TB + subgraph "UnmergeTransform: Linear → Multi-D" + LS["Lower Coordinate Space
1D Linear
Index: 14"] + US["Upper Coordinate Space
3D: [3, 4, 2]
Coord: (1, 3, 0)"] + + DATA["Same Tensor Data
Layout: row-major
Size: 24 elements"] + end + + LS -->|"Forward Transform
14 = 1×8 + 3×2 + 0"| US + US -->|"Inverse Transform
linearize back"| LS + + DATA -.->|"Linear
view"| LS + DATA -.->|"Multi-dimensional
view"| US + + style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px + style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px + style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5 + + +.. image:: diagrams/transforms_4.svg + :alt: Diagram + :align: center .. image:: diagrams/transforms_4.svg :alt: Diagram :align: center @@ -253,28 +285,36 @@ EmbedTransform expands linear indices from the lower coordinate space into multi .. Original mermaid diagram (edit here, then run update_diagrams.py) - .. mermaid:: +.. + Original mermaid diagram (edit here, then run update_diagrams.py) - graph TB - subgraph "EmbedTransform: Linear → Multi-D Strided" - LS["Lower Coordinate Space
1D Linear
Index: 14"] - US["Upper Coordinate Space
2D: [2, 3]
Coord: (1, 2)"] - - DATA["Linear Buffer in Memory"] - end - - LS -->|"Forward Transform
Strides: [12, 1]
14 ÷ 12 = 1, 14 % 12 = 2"| US - US -->|"Inverse Transform
1×12 + 2×1 = 14"| LS - - DATA -.->|"Linear
index view"| LS - DATA -.->|"Multi-dimensional
strided view"| US - - style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px - style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px - style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5 + .. mermaid:: + + graph TB + subgraph "EmbedTransform: Linear → Multi-D Strided" + LS["Lower Coordinate Space
1D Linear
Index: 14"] + US["Upper Coordinate Space
2D: [2, 3]
Coord: (1, 2)"] + + DATA["Linear Buffer in Memory"] + end + + LS -->|"Forward Transform
Strides: [12, 1]
14 ÷ 12 = 1, 14 % 12 = 2"| US + US -->|"Inverse Transform
1×12 + 2×1 = 14"| LS + + DATA -.->|"Linear
index view"| LS + DATA -.->|"Multi-dimensional
strided view"| US + + style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px + style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px + style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5 + + +.. image:: diagrams/transforms_5.svg + :alt: Diagram + :align: center .. image:: diagrams/transforms_5.svg :alt: Diagram :align: center @@ -315,28 +355,36 @@ ReplicateTransform creates a higher-dimensional tensor by replicating (broadcast .. Original mermaid diagram (edit here, then run update_diagrams.py) - .. mermaid:: +.. + Original mermaid diagram (edit here, then run update_diagrams.py) - graph TB - subgraph "ReplicateTransform: 0D → Multi-D Broadcasting" - LS["Lower Coordinate Space
0D: Scalar
Empty coordinate []"] - US["Upper Coordinate Space
2D: [3, 4]
All coords: (i, j)"] - - DATA["Single Scalar Value"] - end - - LS -->|"Forward Transform
[] → (i,j) for any i,j"| US - US -->|"Inverse Transform
(i,j) → [] for any i,j"| LS - - DATA -.->|"One scalar
value"| LS - DATA -.->|"Broadcasted view
at all positions"| US - - style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px - style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px - style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5 + .. mermaid:: + + graph TB + subgraph "ReplicateTransform: 0D → Multi-D Broadcasting" + LS["Lower Coordinate Space
0D: Scalar
Empty coordinate []"] + US["Upper Coordinate Space
2D: [3, 4]
All coords: (i, j)"] + + DATA["Single Scalar Value"] + end + + LS -->|"Forward Transform
[] → (i,j) for any i,j"| US + US -->|"Inverse Transform
(i,j) → [] for any i,j"| LS + + DATA -.->|"One scalar
value"| LS + DATA -.->|"Broadcasted view
at all positions"| US + + style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px + style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px + style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5 + + +.. image:: diagrams/transforms_6.svg + :alt: Diagram + :align: center .. image:: diagrams/transforms_6.svg :alt: Diagram :align: center @@ -388,28 +436,36 @@ OffsetTransform shifts coordinates by a fixed offset, creating a translated view .. Original mermaid diagram (edit here, then run update_diagrams.py) - .. mermaid:: +.. + Original mermaid diagram (edit here, then run update_diagrams.py) - graph TB - subgraph "OffsetTransform: 1D → 1D Translation" - LS["Lower Coordinate Space
1D: [0, 63]
Coord: index + offset"] - US["Upper Coordinate Space
1D: [0, 47]
Coord: index"] - - DATA["Linear Buffer in Memory"] - end - - LS -->|"Forward Transform
idx → idx + 16"| US - US -->|"Inverse Transform
idx + 16 → idx"| LS - - DATA -.->|"Lower
view"| LS - DATA -.->|"Upper
view"| US - - style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px - style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px - style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5 + .. mermaid:: + + graph TB + subgraph "OffsetTransform: 1D → 1D Translation" + LS["Lower Coordinate Space
1D: [0, 63]
Coord: index + offset"] + US["Upper Coordinate Space
1D: [0, 47]
Coord: index"] + + DATA["Linear Buffer in Memory"] + end + + LS -->|"Forward Transform
idx → idx + 16"| US + US -->|"Inverse Transform
idx + 16 → idx"| LS + + DATA -.->|"Lower
view"| LS + DATA -.->|"Upper
view"| US + + style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px + style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px + style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5 + + +.. image:: diagrams/transforms_7.svg + :alt: Diagram + :align: center .. image:: diagrams/transforms_7.svg :alt: Diagram :align: center @@ -462,28 +518,36 @@ No-op transform that passes coordinates unchanged. The PassThrough transform is .. Original mermaid diagram (edit here, then run update_diagrams.py) - .. mermaid:: +.. + Original mermaid diagram (edit here, then run update_diagrams.py) - graph TB - subgraph "PassThroughTransform: 1D → 1D Identity" - LS["Lower Coordinate Space
1D: [0, 59]
Coord: index"] - US["Upper Coordinate Space
1D: [0, 59]
Coord: index"] - - DATA["Linear Buffer in Memory"] - end - - LS -.->|"Perfect Identity
idx → idx"| US - US -.->|"Perfect Identity
idx → idx"| LS - - DATA -->|"Same buffer
same view"| LS - DATA -->|"Same buffer
same view"| US - - style LS fill:#e8f5e8,stroke:#2e7d32,stroke-width:3px - style US fill:#e8f5e8,stroke:#2e7d32,stroke-width:3px - style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5 + .. mermaid:: + + graph TB + subgraph "PassThroughTransform: 1D → 1D Identity" + LS["Lower Coordinate Space
1D: [0, 59]
Coord: index"] + US["Upper Coordinate Space
1D: [0, 59]
Coord: index"] + + DATA["Linear Buffer in Memory"] + end + + LS -.->|"Perfect Identity
idx → idx"| US + US -.->|"Perfect Identity
idx → idx"| LS + + DATA -->|"Same buffer
same view"| LS + DATA -->|"Same buffer
same view"| US + + style LS fill:#e8f5e8,stroke:#2e7d32,stroke-width:3px + style US fill:#e8f5e8,stroke:#2e7d32,stroke-width:3px + style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5 + + +.. image:: diagrams/transforms_8.svg + :alt: Diagram + :align: center .. image:: diagrams/transforms_8.svg :alt: Diagram :align: center @@ -531,28 +595,36 @@ PadTransform adds padding to tensor dimensions, mapping coordinates from upper d .. Original mermaid diagram (edit here, then run update_diagrams.py) - .. mermaid:: +.. + Original mermaid diagram (edit here, then run update_diagrams.py) - graph TB - subgraph "PadTransform: 1D → 1D with Padding" - LS["Lower Coordinate Space
1D: [0, 2] (original data)"] - US["Upper Coordinate Space
1D: [0, 4] (with padding)"] - - DATA["Tensor Data in Memory"] - end - - LS -->|"Forward Transform
idx + left_pad"| US - US -->|"Inverse Transform
idx - left_pad"| LS - - DATA -.->|"Original view"| LS - DATA -.->|"Padded view"| US - - style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px - style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px - style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5 + .. mermaid:: + + graph TB + subgraph "PadTransform: 1D → 1D with Padding" + LS["Lower Coordinate Space
1D: [0, 2] (original data)"] + US["Upper Coordinate Space
1D: [0, 4] (with padding)"] + + DATA["Tensor Data in Memory"] + end + + LS -->|"Forward Transform
idx + left_pad"| US + US -->|"Inverse Transform
idx - left_pad"| LS + + DATA -.->|"Original view"| LS + DATA -.->|"Padded view"| US + + style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px + style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px + style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5 + + +.. image:: diagrams/transforms_9.svg + :alt: Diagram + :align: center .. image:: diagrams/transforms_9.svg :alt: Diagram :align: center @@ -603,28 +675,36 @@ XorTransform applies a 2D XOR mapping for specialized memory access patterns. It .. Original mermaid diagram (edit here, then run update_diagrams.py) - .. mermaid:: +.. + Original mermaid diagram (edit here, then run update_diagrams.py) - graph TB - subgraph "XorTransform: 2D → 2D XOR Mapping" - LS["Lower Coordinate Space
2D: [4, 8]
XOR-transformed coords"] - US["Upper Coordinate Space
2D: [4, 8]
Normal coords"] - - DATA["Same Tensor Data"] - end - - LS -->|"Forward Transform
apply XOR reverse"| US - US -->|"Inverse Transform
apply XOR mapping"| LS - - DATA -.->|"XOR pattern
view"| LS - DATA -.->|"Normal
view"| US - - style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px - style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px - style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5 + .. mermaid:: + + graph TB + subgraph "XorTransform: 2D → 2D XOR Mapping" + LS["Lower Coordinate Space
2D: [4, 8]
XOR-transformed coords"] + US["Upper Coordinate Space
2D: [4, 8]
Normal coords"] + + DATA["Same Tensor Data"] + end + + LS -->|"Forward Transform
apply XOR reverse"| US + US -->|"Inverse Transform
apply XOR mapping"| LS + + DATA -.->|"XOR pattern
view"| LS + DATA -.->|"Normal
view"| US + + style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px + style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px + style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5 + + +.. image:: diagrams/transforms_10.svg + :alt: Diagram + :align: center .. image:: diagrams/transforms_10.svg :alt: Diagram :align: center @@ -636,28 +716,36 @@ SliceTransform extracts a sub-region from a tensor dimension. .. Original mermaid diagram (edit here, then run update_diagrams.py) - .. mermaid:: +.. + Original mermaid diagram (edit here, then run update_diagrams.py) - graph TB - subgraph "SliceTransform: 1D → 1D Sub-region" - LS["Lower Coordinate Space
1D: [0, 9] (original range)"] - US["Upper Coordinate Space
1D: [0, 4] (slice range)"] - - DATA["Tensor Data in Memory"] - end - - LS -->|"Forward Transform
idx + slice_begin"| US - US -->|"Inverse Transform
idx - slice_begin"| LS - - DATA -.->|"Full tensor
view"| LS - DATA -.->|"Sub-region
view"| US - - style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px - style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px - style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5 + .. mermaid:: + + graph TB + subgraph "SliceTransform: 1D → 1D Sub-region" + LS["Lower Coordinate Space
1D: [0, 9] (original range)"] + US["Upper Coordinate Space
1D: [0, 4] (slice range)"] + + DATA["Tensor Data in Memory"] + end + + LS -->|"Forward Transform
idx + slice_begin"| US + US -->|"Inverse Transform
idx - slice_begin"| LS + + DATA -.->|"Full tensor
view"| LS + DATA -.->|"Sub-region
view"| US + + style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px + style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px + style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5 + + +.. image:: diagrams/transforms_11.svg + :alt: Diagram + :align: center .. image:: diagrams/transforms_11.svg :alt: Diagram :align: center @@ -669,28 +757,36 @@ ModuloTransform applies cyclic wrapping to coordinates using modulo operations. .. Original mermaid diagram (edit here, then run update_diagrams.py) - .. mermaid:: +.. + Original mermaid diagram (edit here, then run update_diagrams.py) - graph TB - subgraph "ModuloTransform: 1D → 1D Cyclic" - LS["Lower Coordinate Space
1D: [0, 3] (modulus range)"] - US["Upper Coordinate Space
1D: [0, 15] (full range)"] - - DATA["Tensor Data in Memory"] - end - - LS -->|"Forward Transform
idx * cycle_count"| US - US -->|"Inverse Transform
idx % modulus"| LS - - DATA -.->|" "| LS - DATA -.->|" "| US - - style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px - style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px - style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5 + .. mermaid:: + + graph TB + subgraph "ModuloTransform: 1D → 1D Cyclic" + LS["Lower Coordinate Space
1D: [0, 3] (modulus range)"] + US["Upper Coordinate Space
1D: [0, 15] (full range)"] + + DATA["Tensor Data in Memory"] + end + + LS -->|"Forward Transform
idx * cycle_count"| US + US -->|"Inverse Transform
idx % modulus"| LS + + DATA -.->|" "| LS + DATA -.->|" "| US + + style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px + style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px + style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5 + + +.. image:: diagrams/transforms_12.svg + :alt: Diagram + :align: center .. image:: diagrams/transforms_12.svg :alt: Diagram :align: center