Fix remaining images

This commit is contained in:
Vidyasagar
2025-10-14 07:09:40 -07:00
parent c80df7a092
commit c72bd792c8
44 changed files with 2164 additions and 1530 deletions

View File

@@ -18,35 +18,43 @@ A TensorAdaptor encapsulates a sequence of :ref:`coordinate transformations <ck_
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
..
Original mermaid diagram (edit here, then run update_diagrams.py)
graph LR
subgraph "Adaptor Composition"
subgraph "Single Transform"
direction TB
I1["Input Coords<br/>[0,1,2]"]
T1["Transform<br/>(e.g., Transpose)"]
O1["Output Coords<br/>[2,0,1]"]
I1 --> T1 --> O1
end
subgraph "Chained Transforms"
direction TB
I2["Input<br/>2D"]
T2A["Transform A<br/>(e.g., Merge)"]
M2["Intermediate<br/>1D"]
T2B["Transform B<br/>(e.g., Pad)"]
O2["Output<br/>1D Padded"]
I2 --> T2A --> M2 --> T2B --> O2
end
end
style T1 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style T2A fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style T2B fill:#fff3e0,stroke:#f57c00,stroke-width:2px
.. mermaid::
graph LR
subgraph "Adaptor Composition"
subgraph "Single Transform"
direction TB
I1["Input Coords<br/>[0,1,2]"]
T1["Transform<br/>(e.g., Transpose)"]
O1["Output Coords<br/>[2,0,1]"]
I1 --> T1 --> O1
end
subgraph "Chained Transforms"
direction TB
I2["Input<br/>2D"]
T2A["Transform A<br/>(e.g., Merge)"]
M2["Intermediate<br/>1D"]
T2B["Transform B<br/>(e.g., Pad)"]
O2["Output<br/>1D Padded"]
I2 --> T2A --> M2 --> T2B --> O2
end
end
style T1 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style T2A fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style T2B fill:#fff3e0,stroke:#f57c00,stroke-width:2px
.. image:: diagrams/adaptors_1.svg
:alt: Diagram
:align: center
.. image:: diagrams/adaptors_1.svg
:alt: Diagram
:align: center
@@ -118,44 +126,52 @@ The real power of adaptors comes from chaining multiple transformations together
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
..
Original mermaid diagram (edit here, then run update_diagrams.py)
graph LR
subgraph "Adaptor Chaining Flow"
subgraph "Adaptor 1"
A1I["Bottom Dims<br/>[0,1]"]
A1T["Transform:<br/>Merge[2,3]"]
A1O["Top Dims<br/>[0]"]
end
subgraph "Adaptor 2"
A2I["Bottom Dims<br/>[0]"]
A2T["Transform:<br/>Unmerge[2,3]"]
A2O["Top Dims<br/>[0,1]"]
end
subgraph "Chained Result"
CI["Input 2D<br/>Bottom[0,1]"]
CO["Output 2D<br/>Top[0,1]"]
end
end
A1I --> A1T
A1T --> A1O
A1O --> A2I
A2I --> A2T
A2T --> A2O
CI --> A1I
A2O --> CO
style A1T fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style A2T fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style CI fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style CO fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
.. mermaid::
graph LR
subgraph "Adaptor Chaining Flow"
subgraph "Adaptor 1"
A1I["Bottom Dims<br/>[0,1]"]
A1T["Transform:<br/>Merge[2,3]"]
A1O["Top Dims<br/>[0]"]
end
subgraph "Adaptor 2"
A2I["Bottom Dims<br/>[0]"]
A2T["Transform:<br/>Unmerge[2,3]"]
A2O["Top Dims<br/>[0,1]"]
end
subgraph "Chained Result"
CI["Input 2D<br/>Bottom[0,1]"]
CO["Output 2D<br/>Top[0,1]"]
end
end
A1I --> A1T
A1T --> A1O
A1O --> A2I
A2I --> A2T
A2T --> A2O
CI --> A1I
A2O --> CO
style A1T fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style A2T fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style CI fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style CO fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
.. image:: diagrams/adaptors_2.svg
:alt: Diagram
:align: center
.. image:: diagrams/adaptors_2.svg
:alt: Diagram
:align: center

View File

@@ -23,40 +23,52 @@ Memory coherence and caching policies represent another layer of complexity that
Address Space Usage Patterns
----------------------------
.. raw:: html
<div class="mermaid">
flowchart TB
subgraph CF ["Compute Flow"]
direction LR
GM1["Global Memory<br/>Input Data"] --> LDS["LDS<br/>Tile Cache"]
LDS --> VGPR["VGPR<br/>Working Set"]
VGPR --> Compute["Compute<br/>Operations"]
Compute --> VGPR
VGPR --> LDS2["LDS<br/>Reduction"]
LDS2 --> GM2["Global Memory<br/>Output Data"]
end
subgraph UP ["Usage Pattern"]
direction LR
P1["1. Load tile from Global → LDS"]
P2["2. Load working set LDS → VGPR"]
P3["3. Compute in VGPR"]
P4["4. Store results VGPR → LDS"]
P5["5. Reduce in LDS"]
P6["6. Write final LDS → Global"]
P1 --> P2 --> P3 --> P4 --> P5 --> P6
end
CF ~~~ UP
style GM1 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style LDS fill:#fed7aa,stroke:#f59e0b,stroke-width:2px
style VGPR fill:#d1fae5,stroke:#10b981,stroke-width:2px
style Compute fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
</div>
..
Original mermaid diagram (edit here, then run update_diagrams.py)
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
flowchart TB
subgraph CF ["Compute Flow"]
direction LR
GM1["Global Memory<br/>Input Data"] --> LDS["LDS<br/>Tile Cache"]
LDS --> VGPR["VGPR<br/>Working Set"]
VGPR --> Compute["Compute<br/>Operations"]
Compute --> VGPR
VGPR --> LDS2["LDS<br/>Reduction"]
LDS2 --> GM2["Global Memory<br/>Output Data"]
end
subgraph UP ["Usage Pattern"]
direction LR
P1["1. Load tile from Global → LDS"]
P2["2. Load working set LDS → VGPR"]
P3["3. Compute in VGPR"]
P4["4. Store results VGPR → LDS"]
P5["5. Reduce in LDS"]
P6["6. Write final LDS → Global"]
P1 --> P2 --> P3 --> P4 --> P5 --> P6
end
CF ~~~ UP
style GM1 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style LDS fill:#fed7aa,stroke:#f59e0b,stroke-width:2px
style VGPR fill:#d1fae5,stroke:#10b981,stroke-width:2px
style Compute fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
.. image:: diagrams/buffer_views_1.svg
:alt: Diagram
:align: center
C++ Implementation
------------------
@@ -176,76 +188,100 @@ The implementation of vector access maintains the same parameter structure as sc
Scalar vs Vectorized Memory Access
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. raw:: html
<div class="mermaid">
graph LR
subgraph "Scalar Access (4 instructions)"
S1["Load float[0]"] --> R1["Register 1"]
S2["Load float[1]"] --> R2["Register 2"]
S3["Load float[2]"] --> R3["Register 3"]
S4["Load float[3]"] --> R4["Register 4"]
end
subgraph "Vectorized Access (1 instruction)"
V1["Load float4[0]"] --> VR["Vector Register<br/>(4 floats)"]
end
subgraph "Performance Impact"
Perf["4x fewer instructions<br/>Better memory bandwidth<br/>Reduced latency"]
end
R1 & R2 & R3 & R4 --> Perf
VR --> Perf
style S1 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style S2 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style S3 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style S4 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style V1 fill:#d1fae5,stroke:#10b981,stroke-width:2px
style Perf fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
</div>
..
Original mermaid diagram (edit here, then run update_diagrams.py)
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph LR
subgraph "Scalar Access (4 instructions)"
S1["Load float[0]"] --> R1["Register 1"]
S2["Load float[1]"] --> R2["Register 2"]
S3["Load float[2]"] --> R3["Register 3"]
S4["Load float[3]"] --> R4["Register 4"]
end
subgraph "Vectorized Access (1 instruction)"
V1["Load float4[0]"] --> VR["Vector Register<br/>(4 floats)"]
end
subgraph "Performance Impact"
Perf["4x fewer instructions<br/>Better memory bandwidth<br/>Reduced latency"]
end
R1 & R2 & R3 & R4 --> Perf
VR --> Perf
style S1 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style S2 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style S3 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style S4 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style V1 fill:#d1fae5,stroke:#10b981,stroke-width:2px
style Perf fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
.. image:: diagrams/buffer_views_2.svg
:alt: Diagram
:align: center
Understanding BufferView Indexing
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. raw:: html
<div class="mermaid">
flowchart LR
subgraph "Input Parameters"
Offset["Offset<br/>(e.g., 5)"]
ValidFlag["Valid Flag<br/>(optional)"]
end
subgraph "Processing"
BoundsCheck{{"Bounds Check<br/>offset < buffer_size?"}}
FlagCheck{{"Flag Check<br/>valid_flag == True?"}}
Access["Access Memory<br/>buffer[offset]"]
end
subgraph "Output"
ValidResult["Valid Result<br/>Return value"]
Invalid["Invalid Result<br/>Return 0 or default"]
end
Offset --> BoundsCheck
ValidFlag --> FlagCheck
BoundsCheck -->|Yes| FlagCheck
BoundsCheck -->|No| Invalid
FlagCheck -->|Yes| Access
FlagCheck -->|No| Invalid
Access --> ValidResult
style Offset fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
style ValidFlag fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
style ValidResult fill:#d1fae5,stroke:#10b981,stroke-width:2px
style Invalid fill:#fee2e2,stroke:#ef4444,stroke-width:2px
</div>
..
Original mermaid diagram (edit here, then run update_diagrams.py)
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
flowchart LR
subgraph "Input Parameters"
Offset["Offset<br/>(e.g., 5)"]
ValidFlag["Valid Flag<br/>(optional)"]
end
subgraph "Processing"
BoundsCheck{{"Bounds Check<br/>offset < buffer_size?"}}
FlagCheck{{"Flag Check<br/>valid_flag == True?"}}
Access["Access Memory<br/>buffer[offset]"]
end
subgraph "Output"
ValidResult["Valid Result<br/>Return value"]
Invalid["Invalid Result<br/>Return 0 or default"]
end
Offset --> BoundsCheck
ValidFlag --> FlagCheck
BoundsCheck -->|Yes| FlagCheck
BoundsCheck -->|No| Invalid
FlagCheck -->|Yes| Access
FlagCheck -->|No| Invalid
Access --> ValidResult
style Offset fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
style ValidFlag fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
style ValidResult fill:#d1fae5,stroke:#10b981,stroke-width:2px
style Invalid fill:#fee2e2,stroke:#ef4444,stroke-width:2px
.. image:: diagrams/buffer_views_3.svg
:alt: Diagram
:align: center
C++ Get Operations
~~~~~~~~~~~~~~~~~~
@@ -341,28 +377,40 @@ Atomic Operations
Atomic vs Non-Atomic Operations
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. raw:: html
<div class="mermaid">
graph TB
subgraph "Non-Atomic Operation (Race Condition)"
NA1["Thread 1: Read value (10)"] --> NA2["Thread 1: Add 5 (15)"]
NA3["Thread 2: Read value (10)"] --> NA4["Thread 2: Add 3 (13)"]
NA2 --> NA5["Thread 1: Write 15"]
NA4 --> NA6["Thread 2: Write 13"]
NA5 & NA6 --> NA7["Final value: 13 ❌<br/>(Lost update from Thread 1)"]
end
subgraph "Atomic Operation (Thread-Safe)"
A1["Thread 1: atomic_add(5)"] --> A2["Hardware ensures<br/>serialization"]
A3["Thread 2: atomic_add(3)"] --> A2
A2 --> A4["Final value: 18 ✓<br/>(Both updates applied)"]
end
style NA7 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style A4 fill:#d1fae5,stroke:#10b981,stroke-width:2px
</div>
..
Original mermaid diagram (edit here, then run update_diagrams.py)
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph TB
subgraph "Non-Atomic Operation (Race Condition)"
NA1["Thread 1: Read value (10)"] --> NA2["Thread 1: Add 5 (15)"]
NA3["Thread 2: Read value (10)"] --> NA4["Thread 2: Add 3 (13)"]
NA2 --> NA5["Thread 1: Write 15"]
NA4 --> NA6["Thread 2: Write 13"]
NA5 & NA6 --> NA7["Final value: 13 ❌<br/>(Lost update from Thread 1)"]
end
subgraph "Atomic Operation (Thread-Safe)"
A1["Thread 1: atomic_add(5)"] --> A2["Hardware ensures<br/>serialization"]
A3["Thread 2: atomic_add(3)"] --> A2
A2 --> A4["Final value: 18 ✓<br/>(Both updates applied)"]
end
style NA7 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style A4 fill:#d1fae5,stroke:#10b981,stroke-width:2px
.. image:: diagrams/buffer_views_4.svg
:alt: Diagram
:align: center
C++ Atomic Operations
~~~~~~~~~~~~~~~~~~~~~

View File

@@ -31,11 +31,16 @@ RST_FILES = [
'descriptors.rst',
'coordinate_movement.rst',
'adaptors.rst',
'introduction_motivation.rst',
'buffer_views.rst',
'tensor_views.rst',
'coordinate_systems.rst',
'tile_distribution.rst',
]
# Pattern to find mermaid blocks
# Pattern to find mermaid blocks (can be indented with 3 spaces for commented blocks)
MERMAID_PATTERN = re.compile(
r'^\.\. mermaid::\s*\n((?:(?:\n| .*))*)',
r'^(?: )?\.\. mermaid::\s*\n((?:(?:\n| .*))*)',
re.MULTILINE
)

View File

@@ -0,0 +1,81 @@
#!/usr/bin/env python3
"""Convert raw HTML mermaid blocks to commented format for SVG conversion."""
import os
import re
def convert_raw_html_to_commented(content):
"""Convert raw HTML mermaid blocks to commented mermaid format."""
# Pattern to match raw HTML mermaid blocks
pattern = r'\.\. raw:: html\n\n <div class="mermaid"[^>]*>\n(.*?)\n </div>'
def replace_block(match):
mermaid_code = match.group(1)
# The mermaid code in HTML has 3-space indentation, keep it
# but add 3 more spaces for .. mermaid:: indentation
mermaid_lines = mermaid_code.split('\n')
properly_indented = []
for line in mermaid_lines:
if line.strip(): # Non-empty line
# Line already has 3 spaces from HTML, add 3 more for mermaid block
properly_indented.append(' ' + line)
else:
properly_indented.append('')
indented_code = '\n'.join(properly_indented)
# Create commented format matching the expected pattern
commented = f"""..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
{indented_code}
"""
return commented
return re.sub(pattern, replace_block, content, flags=re.DOTALL)
def main():
"""Process files with raw HTML mermaid blocks."""
files_to_convert = [
'introduction_motivation.rst',
'buffer_views.rst',
'tensor_views.rst',
'coordinate_systems.rst',
'tile_distribution.rst'
]
converted_files = []
for filename in files_to_convert:
if not os.path.exists(filename):
print(f'Skipping {filename} - not found')
continue
with open(filename, 'r', encoding='utf-8') as f:
original = f.read()
converted = convert_raw_html_to_commented(original)
if converted != original:
with open(filename, 'w', encoding='utf-8') as f:
f.write(converted)
blocks_converted = original.count('.. raw:: html')
converted_files.append((filename, blocks_converted))
print(f'✓ Converted {filename}: {blocks_converted} blocks')
else:
print(f' {filename}: no raw HTML blocks found')
print(f'\n=== CONVERSION COMPLETE ===')
print(f'Files converted: {len(converted_files)}')
print(f'Total blocks: {sum(c for _, c in converted_files)}')
print('\nNext: Run convert_mermaid_to_svg.py to generate SVG files')
if __name__ == '__main__':
main()

View File

@@ -18,42 +18,50 @@ The key insight is that convolution can be transformed from a complex nested loo
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
..
Original mermaid diagram (edit here, then run update_diagrams.py)
graph TB
subgraph "Convolution Process"
I["Input Image<br/>6×6"]
K["Kernel<br/>3×3"]
SW["Sliding Window<br/>Extract 3×3 patches"]
DP["Dot Product<br/>Element-wise multiply & sum"]
O["Output<br/>4×4"]
end
subgraph "Im2col Optimization"
W["Windows Matrix<br/>16×9<br/>(all patches)"]
KF["Kernel Flattened<br/>9×1"]
MM["Matrix Multiply<br/>W @ K"]
OF["Output Flattened<br/>16×1"]
end
I --> SW
K --> DP
SW --> DP
DP --> O
SW --> W
K --> KF
W --> MM
KF --> MM
MM --> OF
OF --> O
style I fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style O fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style MM fill:#fff3e0,stroke:#f57c00,stroke-width:2px
.. mermaid::
graph TB
subgraph "Convolution Process"
I["Input Image<br/>6×6"]
K["Kernel<br/>3×3"]
SW["Sliding Window<br/>Extract 3×3 patches"]
DP["Dot Product<br/>Element-wise multiply & sum"]
O["Output<br/>4×4"]
end
subgraph "Im2col Optimization"
W["Windows Matrix<br/>16×9<br/>(all patches)"]
KF["Kernel Flattened<br/>9×1"]
MM["Matrix Multiply<br/>W @ K"]
OF["Output Flattened<br/>16×1"]
end
I --> SW
K --> DP
SW --> DP
DP --> O
SW --> W
K --> KF
W --> MM
KF --> MM
MM --> OF
OF --> O
style I fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style O fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style MM fill:#fff3e0,stroke:#f57c00,stroke-width:2px
.. image:: diagrams/convolution_example.svg
:alt: Diagram
:align: center
.. image:: diagrams/convolution_example.svg
:alt: Diagram
:align: center

View File

@@ -20,36 +20,44 @@ For the mathematical foundations of coordinate systems, see :ref:`ck_tile_coordi
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph TB
subgraph "Coordinate Movement System"
TC["TensorCoordinate<br/>Position + Descriptor Context"]
TAC["TensorAdaptorCoordinate<br/>Position + Transform Context"]
MC["move_coordinate()<br/>Efficient Navigation"]
end
subgraph "Movement Example"
S["Start: [1,1]<br/>Offset: 5"]
M1["Move [0,1]<br/>→ [1,2]<br/>Offset: 6"]
M2["Move [1,0]<br/>→ [2,2]<br/>Offset: 10"]
M3["Move [1,1]<br/>→ [3,3]<br/>Offset: 15"]
end
TC --> MC
TAC --> MC
S --> M1
M1 --> M2
M2 --> M3
style TC fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style TAC fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style MC fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph TB
subgraph "Coordinate Movement System"
TC["TensorCoordinate<br/>Position + Descriptor Context"]
TAC["TensorAdaptorCoordinate<br/>Position + Transform Context"]
MC["move_coordinate()<br/>Efficient Navigation"]
end
subgraph "Movement Example"
S["Start: [1,1]<br/>Offset: 5"]
M1["Move [0,1]<br/>→ [1,2]<br/>Offset: 6"]
M2["Move [1,0]<br/>→ [2,2]<br/>Offset: 10"]
M3["Move [1,1]<br/>→ [3,3]<br/>Offset: 15"]
end
TC --> MC
TAC --> MC
S --> M1
M1 --> M2
M2 --> M3
style TC fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style TAC fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style MC fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
.. image:: diagrams/coordinate_movement.svg
:alt: Diagram
:align: center
.. image:: diagrams/coordinate_movement.svg
:alt: Diagram
:align: center

View File

@@ -15,39 +15,51 @@ The Five Coordinate Spaces
The CK framework employs five interconnected coordinate spaces, each serving a specific purpose in the journey from thread identification to memory access. These spaces work together to solve the fundamental challenge of GPU programming: efficiently distributing work across thousands of parallel threads while maintaining optimal memory access patterns.
.. raw:: html
<div class="mermaid">
graph TB
subgraph "Coordinate Spaces Overview"
P["P-space<br/>Thread Identification<br/>Which thread am I?"]
Y["Y-space<br/>Logical Tile<br/>Which element in my tile?"]
X["X-space<br/>Physical Tensor<br/>Where in the tensor?"]
R["R-space<br/>Replication<br/>Data sharing pattern"]
D["D-space<br/>Linear Storage<br/>Memory address"]
end
subgraph "Transformations"
T1["P + Y → X<br/>Thread + Element → Position"]
T2["X → D<br/>Position → Address"]
end
P --> T1
Y --> T1
T1 --> X
X --> T2
T2 --> D
R -.-> P
R -.-> Y
style P fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style Y fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style R fill:#fce4ec,stroke:#c2185b,stroke-width:2px
style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
</div>
..
Original mermaid diagram (edit here, then run update_diagrams.py)
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph TB
subgraph "Coordinate Spaces Overview"
P["P-space<br/>Thread Identification<br/>Which thread am I?"]
Y["Y-space<br/>Logical Tile<br/>Which element in my tile?"]
X["X-space<br/>Physical Tensor<br/>Where in the tensor?"]
R["R-space<br/>Replication<br/>Data sharing pattern"]
D["D-space<br/>Linear Storage<br/>Memory address"]
end
subgraph "Transformations"
T1["P + Y → X<br/>Thread + Element → Position"]
T2["X → D<br/>Position → Address"]
end
P --> T1
Y --> T1
T1 --> X
X --> T2
T2 --> D
R -.-> P
R -.-> Y
style P fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style Y fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style R fill:#fce4ec,stroke:#c2185b,stroke-width:2px
style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
.. image:: diagrams/coordinate_systems_1.svg
:alt: Diagram
:align: center
The Challenge and Solution
~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -69,41 +81,53 @@ P-space (Partition Space) represents the foundation of the coordinate system hie
GPU Thread Hierarchy
~~~~~~~~~~~~~~~~~~~~
.. raw:: html
<div class="mermaid">
graph TB
subgraph "GPU Thread Hierarchy"
subgraph "Block"
subgraph "Warp 0"
T0["Thread 0<br/>P=[0,0]"]
T1["Thread 1<br/>P=[0,1]"]
T2["Thread 2<br/>P=[0,2]"]
T31["..."]
T3["Thread 31<br/>P=[0,31]"]
end
subgraph "Warp 1"
T32["Thread 32<br/>P=[1,0]"]
T33["Thread 33<br/>P=[1,1]"]
T34["..."]
T63["Thread 63<br/>P=[1,31]"]
end
W2["Warp 2..."]
W7["Warp 7"]
end
end
subgraph "P-space Mapping"
PM["P-coordinates = [warp_id, lane_id]<br/>or<br/>P-coordinates = [block_x, block_y, thread_x, thread_y]"]
end
T0 --> PM
T32 --> PM
style T0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style T32 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
</div>
..
Original mermaid diagram (edit here, then run update_diagrams.py)
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph TB
subgraph "GPU Thread Hierarchy"
subgraph "Block"
subgraph "Warp 0"
T0["Thread 0<br/>P=[0,0]"]
T1["Thread 1<br/>P=[0,1]"]
T2["Thread 2<br/>P=[0,2]"]
T31["..."]
T3["Thread 31<br/>P=[0,31]"]
end
subgraph "Warp 1"
T32["Thread 32<br/>P=[1,0]"]
T33["Thread 33<br/>P=[1,1]"]
T34["..."]
T63["Thread 63<br/>P=[1,31]"]
end
W2["Warp 2..."]
W7["Warp 7"]
end
end
subgraph "P-space Mapping"
PM["P-coordinates = [warp_id, lane_id]<br/>or<br/>P-coordinates = [block_x, block_y, thread_x, thread_y]"]
end
T0 --> PM
T32 --> PM
style T0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style T32 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
.. image:: diagrams/coordinate_systems_2.svg
:alt: Diagram
:align: center
The structure of P-space directly reflects the :ref:`hardware organization <ck_tile_gpu_basics>` of modern GPUs. Each thread receives a unique P-coordinate that encodes its position within the execution hierarchy. For simple distributions, P-space might be one-dimensional, containing only a thread ID. For complex hierarchical distributions, P-space can have multiple dimensions representing different levels of the GPU's thread organization.
C++ Implementation
@@ -147,44 +171,56 @@ Y-space (Yield Space) represents the logical organization of work within each th
Work Assignment Structure
~~~~~~~~~~~~~~~~~~~~~~~~~
.. raw:: html
<div class="mermaid">
graph TB
subgraph "Thread's Tile (2x2 elements)"
Y00["Y=[0,0]<br/>Element 0"]
Y01["Y=[0,1]<br/>Element 1"]
Y10["Y=[1,0]<br/>Element 2"]
Y11["Y=[1,1]<br/>Element 3"]
end
subgraph "Y-space Structure"
YS["Each thread processes<br/>the same Y-space pattern<br/>but at different X locations"]
end
subgraph "Example: 4 Threads"
T0["Thread 0<br/>P=[0,0]"]
T1["Thread 1<br/>P=[0,1]"]
T2["Thread 2<br/>P=[1,0]"]
T3["Thread 3<br/>P=[1,1]"]
end
Y00 --> YS
Y01 --> YS
Y10 --> YS
Y11 --> YS
T0 --> YS
T1 --> YS
T2 --> YS
T3 --> YS
style Y00 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style Y01 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style Y10 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style Y11 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
</div>
..
Original mermaid diagram (edit here, then run update_diagrams.py)
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph TB
subgraph "Thread's Tile (2x2 elements)"
Y00["Y=[0,0]<br/>Element 0"]
Y01["Y=[0,1]<br/>Element 1"]
Y10["Y=[1,0]<br/>Element 2"]
Y11["Y=[1,1]<br/>Element 3"]
end
subgraph "Y-space Structure"
YS["Each thread processes<br/>the same Y-space pattern<br/>but at different X locations"]
end
subgraph "Example: 4 Threads"
T0["Thread 0<br/>P=[0,0]"]
T1["Thread 1<br/>P=[0,1]"]
T2["Thread 2<br/>P=[1,0]"]
T3["Thread 3<br/>P=[1,1]"]
end
Y00 --> YS
Y01 --> YS
Y10 --> YS
Y11 --> YS
T0 --> YS
T1 --> YS
T2 --> YS
T3 --> YS
style Y00 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style Y01 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style Y10 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style Y11 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
.. image:: diagrams/coordinate_systems_3.svg
:alt: Diagram
:align: center
The power of Y-space lies in its ability to express different iteration patterns without changing the underlying distribution logic. A thread might traverse its Y-space in row-major order for one algorithm, column-major for another, or even use :ref:`space-filling curves <ck_tile_space_filling_curve>` for optimal cache utilization. This flexibility enables algorithm-specific optimizations while maintaining a consistent framework.
Hierarchical Y-Space
@@ -265,37 +301,49 @@ The transformation from P and Y coordinates to X coordinates represents the hear
Transformation Pipeline
~~~~~~~~~~~~~~~~~~~~~~~
.. raw:: html
<div class="mermaid">
graph LR
subgraph "Input"
P["P-coordinates<br/>Thread identity<br/>P=[1,0]"]
Y["Y-coordinates<br/>Element in tile<br/>Y=[0,1]"]
end
subgraph "Transformation"
T["P + Y → X<br/>Base position + Offset"]
end
subgraph "Output"
X["X-coordinates<br/>Tensor position<br/>X=[2,1]"]
end
subgraph "Example"
E["Thread P=[1,0] at base (2,0)<br/>Element Y=[0,1] adds offset (0,1)<br/>Result X=[2,1] in tensor"]
end
P --> T
Y --> T
T --> X
X --> E
style P fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style Y fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
</div>
..
Original mermaid diagram (edit here, then run update_diagrams.py)
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph LR
subgraph "Input"
P["P-coordinates<br/>Thread identity<br/>P=[1,0]"]
Y["Y-coordinates<br/>Element in tile<br/>Y=[0,1]"]
end
subgraph "Transformation"
T["P + Y → X<br/>Base position + Offset"]
end
subgraph "Output"
X["X-coordinates<br/>Tensor position<br/>X=[2,1]"]
end
subgraph "Example"
E["Thread P=[1,0] at base (2,0)<br/>Element Y=[0,1] adds offset (0,1)<br/>Result X=[2,1] in tensor"]
end
P --> T
Y --> T
T --> X
X --> E
style P fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style Y fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
.. image:: diagrams/coordinate_systems_4.svg
:alt: Diagram
:align: center
Mathematical Foundation
~~~~~~~~~~~~~~~~~~~~~~~
@@ -361,33 +409,45 @@ D-space represents the final transformation in the coordinate pipeline—convert
Linearization Strategies
~~~~~~~~~~~~~~~~~~~~~~~~
.. raw:: html
<div class="mermaid">
graph LR
subgraph "X-coordinates"
X["X = [2, 3]<br/>2D Position"]
end
subgraph "Layout Options"
RM["Row-Major<br/>D = 2×width + 3"]
CM["Column-Major<br/>D = 3×height + 2"]
BL["Blocked<br/>Complex pattern"]
end
subgraph "D-coordinate"
D["D = 11<br/>Linear Address"]
end
X --> RM
X --> CM
X --> BL
RM --> D
style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
</div>
..
Original mermaid diagram (edit here, then run update_diagrams.py)
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph LR
subgraph "X-coordinates"
X["X = [2, 3]<br/>2D Position"]
end
subgraph "Layout Options"
RM["Row-Major<br/>D = 2×width + 3"]
CM["Column-Major<br/>D = 3×height + 2"]
BL["Blocked<br/>Complex pattern"]
end
subgraph "D-coordinate"
D["D = 11<br/>Linear Address"]
end
X --> RM
X --> CM
X --> BL
RM --> D
style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
.. image:: diagrams/coordinate_systems_5.svg
:alt: Diagram
:align: center
The linearization process must consider multiple factors:
.. code-block:: cpp
@@ -417,48 +477,60 @@ Complete Pipeline Example
Let's trace through a complete example showing how all coordinate spaces work together:
.. raw:: html
<div class="mermaid">
graph TB
subgraph "Step 1: Thread Identification"
TID["Thread ID = 5"]
P["P-coordinates<br/>P = [0, 5]<br/>(warp 0, lane 5)"]
end
subgraph "Step 2: Work Assignment"
Y["Y-coordinates<br/>Y = [1, 0]<br/>(element in tile)"]
end
subgraph "Step 3: P+Y Transformation"
TRANS["P + Y → X<br/>Thread position + Element offset"]
X["X-coordinates<br/>X = [1, 5]<br/>(tensor position)"]
end
subgraph "Step 4: Linearization"
LIN["XD<br/>Row-major: D = x₀ × width + x₁"]
D["D-coordinate<br/>D = 13<br/>(memory address)"]
end
subgraph "Step 5: Memory Access"
MEM["Hardware accesses<br/>memory[13]"]
end
TID --> P
P --> TRANS
Y --> TRANS
TRANS --> X
X --> LIN
LIN --> D
D --> MEM
style P fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
style Y fill:#fff3e0,stroke:#f57c00,stroke-width:3px
style X fill:#e8f5e9,stroke:#388e3c,stroke-width:3px
style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:3px
style MEM fill:#ffebee,stroke:#c62828,stroke-width:3px
</div>
..
Original mermaid diagram (edit here, then run update_diagrams.py)
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph TB
subgraph "Step 1: Thread Identification"
TID["Thread ID = 5"]
P["P-coordinates<br/>P = [0, 5]<br/>(warp 0, lane 5)"]
end
subgraph "Step 2: Work Assignment"
Y["Y-coordinates<br/>Y = [1, 0]<br/>(element in tile)"]
end
subgraph "Step 3: P+Y Transformation"
TRANS["P + YX<br/>Thread position + Element offset"]
X["X-coordinates<br/>X = [1, 5]<br/>(tensor position)"]
end
subgraph "Step 4: Linearization"
LIN["X → D<br/>Row-major: D = x₀ × width + x₁"]
D["D-coordinate<br/>D = 13<br/>(memory address)"]
end
subgraph "Step 5: Memory Access"
MEM["Hardware accesses<br/>memory[13]"]
end
TID --> P
P --> TRANS
Y --> TRANS
TRANS --> X
X --> LIN
LIN --> D
D --> MEM
style P fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
style Y fill:#fff3e0,stroke:#f57c00,stroke-width:3px
style X fill:#e8f5e9,stroke:#388e3c,stroke-width:3px
style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:3px
style MEM fill:#ffebee,stroke:#c62828,stroke-width:3px
.. image:: diagrams/coordinate_systems_6.svg
:alt: Diagram
:align: center
Real-World Example: Matrix Multiplication
-----------------------------------------

View File

@@ -99,35 +99,43 @@ Every TensorDescriptor in CK Tile can be thought of as a **transformation pipeli
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
..
Original mermaid diagram (edit here, then run update_diagrams.py)
graph LR
subgraph "Pipeline Stages"
S1["Stage 1<br/>Base Layout<br/>[M, N]"]
S2["Stage 2<br/>Transform<br/>Unmerge"]
S3["Stage 3<br/>New View<br/>[M1, M2, N]"]
S4["Stage N<br/>Final View<br/>[...]"]
end
subgraph "Same Data"
D["Physical Memory<br/>No data movement"]
end
S1 --> S2
S2 --> S3
S3 --> S4
S1 -.-> D
S2 -.-> D
S3 -.-> D
S4 -.-> D
style D fill:#ffebee,stroke:#d32f2f,stroke-width:2px
style S1 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style S3 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
.. mermaid::
graph LR
subgraph "Pipeline Stages"
S1["Stage 1<br/>Base Layout<br/>[M, N]"]
S2["Stage 2<br/>Transform<br/>Unmerge"]
S3["Stage 3<br/>New View<br/>[M1, M2, N]"]
S4["Stage N<br/>Final View<br/>[...]"]
end
subgraph "Same Data"
D["Physical Memory<br/>No data movement"]
end
S1 --> S2
S2 --> S3
S3 --> S4
S1 -.-> D
S2 -.-> D
S3 -.-> D
S4 -.-> D
style D fill:#ffebee,stroke:#d32f2f,stroke-width:2px
style S1 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style S3 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
.. image:: diagrams/descriptors_1.svg
:alt: Diagram
:align: center
.. image:: diagrams/descriptors_1.svg
:alt: Diagram
:align: center
@@ -197,40 +205,48 @@ Analysis of the Final Pipeline
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
..
Original mermaid diagram (edit here, then run update_diagrams.py)
graph TB
subgraph "Transform Pipeline"
T0["Transform 0<br/>Base Unmerge<br/>Input: [0]<br/>Output: [1,2]"]
T1["Transform 1<br/>PassThrough<br/>Input: [1]<br/>Output: [3]"]
T2["Transform 2<br/>Unmerge<br/>Input: [2]<br/>Output: [4,5]"]
end
subgraph "Hidden Dimensions"
H0["Hidden ID 0<br/>Raw Buffer"]
H1["Hidden ID 1<br/>Dim 0 (size 2)"]
H2["Hidden ID 2<br/>Dim 1 (size 6)"]
H3["Hidden ID 3<br/>Final Dim 0"]
H4["Hidden ID 4<br/>Final Dim 1"]
H5["Hidden ID 5<br/>Final Dim 2"]
end
H0 --> T0
T0 --> H1
T0 --> H2
H1 --> T1
H2 --> T2
T1 --> H3
T2 --> H4
T2 --> H5
style H0 fill:#ffebee,stroke:#d32f2f,stroke-width:2px
style H3 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style H4 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style H5 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
.. mermaid::
graph TB
subgraph "Transform Pipeline"
T0["Transform 0<br/>Base Unmerge<br/>Input: [0]<br/>Output: [1,2]"]
T1["Transform 1<br/>PassThrough<br/>Input: [1]<br/>Output: [3]"]
T2["Transform 2<br/>Unmerge<br/>Input: [2]<br/>Output: [4,5]"]
end
subgraph "Hidden Dimensions"
H0["Hidden ID 0<br/>Raw Buffer"]
H1["Hidden ID 1<br/>Dim 0 (size 2)"]
H2["Hidden ID 2<br/>Dim 1 (size 6)"]
H3["Hidden ID 3<br/>Final Dim 0"]
H4["Hidden ID 4<br/>Final Dim 1"]
H5["Hidden ID 5<br/>Final Dim 2"]
end
H0 --> T0
T0 --> H1
T0 --> H2
H1 --> T1
H2 --> T2
T1 --> H3
T2 --> H4
T2 --> H5
style H0 fill:#ffebee,stroke:#d32f2f,stroke-width:2px
style H3 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style H4 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style H5 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
.. image:: diagrams/descriptors_2.svg
:alt: Diagram
:align: center
.. image:: diagrams/descriptors_2.svg
:alt: Diagram
:align: center

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 23 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 22 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 24 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 20 KiB

File diff suppressed because one or more lines are too long

Before

Width:  |  Height:  |  Size: 20 KiB

After

Width:  |  Height:  |  Size: 20 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 17 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 16 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 20 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 14 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 13 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 19 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 33 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 14 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 15 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 13 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 13 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 14 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 14 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 21 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 15 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 18 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 16 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 13 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 14 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 18 KiB

File diff suppressed because one or more lines are too long

Before

Width:  |  Height:  |  Size: 16 KiB

After

Width:  |  Height:  |  Size: 16 KiB

View File

@@ -17,75 +17,81 @@ In this introduction, we establish the fundamental problems that tile distributi
The GPU Memory Problem
----------------------
.. raw:: html
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph TB
subgraph "Random Access Pattern (Inefficient)"
subgraph "Threads"
T0_R["Thread 0"]
T1_R["Thread 1"]
T2_R["Thread 2"]
T3_R["Thread 3"]
end
<div class="mermaid" style="margin: 0 auto; display: block; width: 60%;">
graph TB
subgraph "Random Access Pattern (Inefficient)"
subgraph "Threads"
T0_R["Thread 0"]
T1_R["Thread 1"]
T2_R["Thread 2"]
T3_R["Thread 3"]
end
subgraph "Memory"
M0["Mem[0]"]
M7["Mem[7]"]
M15["Mem[15]"]
M23["Mem[23]"]
M31["Mem[31]"]
M39["Mem[39]"]
M47["Mem[47]"]
M55["Mem[55]"]
end
T0_R -.-> M23
T1_R -.-> M7
T2_R -.-> M47
T3_R -.-> M15
end
subgraph "Tile Distribution Pattern (Efficient)"
subgraph "Threads_TD"
T0_TD["Thread 0"]
T1_TD["Thread 1"]
T2_TD["Thread 2"]
T3_TD["Thread 3"]
end
subgraph "Memory_TD"
M0_TD["Mem[0]"]
M1_TD["Mem[1]"]
M2_TD["Mem[2]"]
M3_TD["Mem[3]"]
M4_TD["Mem[4]"]
M5_TD["Mem[5]"]
M6_TD["Mem[6]"]
M7_TD["Mem[7]"]
end
T0_TD --> M0_TD
T0_TD --> M1_TD
T1_TD --> M2_TD
T1_TD --> M3_TD
T2_TD --> M4_TD
T2_TD --> M5_TD
T3_TD --> M6_TD
T3_TD --> M7_TD
end
style T0_R fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style T1_R fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style T2_R fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style T3_R fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style T0_TD fill:#d1fae5,stroke:#10b981,stroke-width:2px
style T1_TD fill:#d1fae5,stroke:#10b981,stroke-width:2px
style T2_TD fill:#d1fae5,stroke:#10b981,stroke-width:2px
style T3_TD fill:#d1fae5,stroke:#10b981,stroke-width:2px
</div>
subgraph "Memory"
M0["Mem[0]"]
M7["Mem[7]"]
M15["Mem[15]"]
M23["Mem[23]"]
M31["Mem[31]"]
M39["Mem[39]"]
M47["Mem[47]"]
M55["Mem[55]"]
end
T0_R -.-> M23
T1_R -.-> M7
T2_R -.-> M47
T3_R -.-> M15
end
subgraph "Tile Distribution Pattern (Efficient)"
subgraph "Threads_TD"
T0_TD["Thread 0"]
T1_TD["Thread 1"]
T2_TD["Thread 2"]
T3_TD["Thread 3"]
end
subgraph "Memory_TD"
M0_TD["Mem[0]"]
M1_TD["Mem[1]"]
M2_TD["Mem[2]"]
M3_TD["Mem[3]"]
M4_TD["Mem[4]"]
M5_TD["Mem[5]"]
M6_TD["Mem[6]"]
M7_TD["Mem[7]"]
end
T0_TD --> M0_TD
T0_TD --> M1_TD
T1_TD --> M2_TD
T1_TD --> M3_TD
T2_TD --> M4_TD
T2_TD --> M5_TD
T3_TD --> M6_TD
T3_TD --> M7_TD
end
style T0_R fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style T1_R fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style T2_R fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style T3_R fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style T0_TD fill:#d1fae5,stroke:#10b981,stroke-width:2px
style T1_TD fill:#d1fae5,stroke:#10b981,stroke-width:2px
style T2_TD fill:#d1fae5,stroke:#10b981,stroke-width:2px
style T3_TD fill:#d1fae5,stroke:#10b981,stroke-width:2px
.. image:: diagrams/introduction_motivation_1.svg
:alt: Diagram
:align: center
Why Random Memory Access is Slow
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -209,36 +215,42 @@ The Coordinate Mapping Insight
At the heart of tile distribution lies a profound mathematical insight: efficient GPU computation requires a systematic framework for mapping between different coordinate spaces. This framework transforms the complex problem of thread-to-data assignment into a series of well-defined mathematical transformations, each serving a specific purpose in the journey from abstract algorithm to concrete hardware execution.
.. raw:: html
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph LR
subgraph "Coordinate Spaces"
P["P-space<br/>Thread Position<br/>(thread_x, thread_y,<br/>warp_id, block_id)"]
Y["Y-space<br/>Local Data<br/>(y0, y1, y2, y3)"]
X["X-space<br/>Global Position<br/>(x0, x1)"]
D["D-space<br/>Memory Address<br/>(linearized)"]
end
<div class="mermaid">
graph LR
subgraph "Coordinate Spaces"
P["P-space<br/>Thread Position<br/>(thread_x, thread_y,<br/>warp_id, block_id)"]
Y["Y-space<br/>Local Data<br/>(y0, y1, y2, y3)"]
X["X-space<br/>Global Position<br/>(x0, x1)"]
D["D-space<br/>Memory Address<br/>(linearized)"]
end
subgraph "Transformations"
T1["P + Y → X<br/>Thread data mapping"]
T2["X → D<br/>Memory linearization"]
end
P --> T1
Y --> T1
T1 --> X
X --> T2
T2 --> D
style P fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style Y fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
style T1 fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
style T2 fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
</div>
subgraph "Transformations"
T1["P + Y → X<br/>Thread data mapping"]
T2["X → D<br/>Memory linearization"]
end
P --> T1
Y --> T1
T1 --> X
X --> T2
T2 --> D
style P fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style Y fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
style T1 fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
style T2 fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
.. image:: diagrams/introduction_motivation_2.svg
:alt: Diagram
:align: center
The elegance of this approach emerges from its separation of concerns. Each coordinate space represents a distinct aspect of the computation, and the transformations between them encapsulate specific optimization strategies. This separation allows developers to reason about their algorithms in natural terms while the framework handles the complex mapping to efficient hardware execution patterns.
**P-space (Thread Position Space)** represents the physical organization of threads on the GPU. This space captures the hierarchical nature of GPU execution, from individual threads identified by their x and y coordinates within a block, to warps that execute in lockstep, to thread blocks that share resources. The coordinates in P-space—thread_x, thread_y, warp_id, and block_id—directly correspond to the hardware's execution model. Understanding P-space is crucial because it determines which threads can cooperate efficiently through shared memory and which threads will execute their memory accesses simultaneously.

View File

@@ -28,42 +28,50 @@ The original K coordinate is split into K0 and K1, where K1 represents the threa
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
..
Original mermaid diagram (edit here, then run update_diagrams.py)
graph TB
subgraph "3D LDS coordinate [K0, M, K1]"
K0["KPerBlock/KPack * MLdsLayer<br/>K0"]
M["MPerBlock/MLdsLayer<br/>M"]
K1["KPack<br/>K1"]
end
.. mermaid::
subgraph "XOR Transform"
XT["make_xor_transform"]
end
graph TB
subgraph "3D LDS coordinate [K0, M, K1]"
K0["KPerBlock/KPack * MLdsLayer<br/>K0"]
M["MPerBlock/MLdsLayer<br/>M"]
K1["KPack<br/>K1"]
end
subgraph "XOR Transform"
XT["make_xor_transform"]
end
subgraph "Update K0 with XOR transformation"
K01["KPerBlock/KPack * MLdsLayer<br/>K0'"]
M1["MPerBlock/MLdsLayer<br/>M"]
K11["KPack<br/>K1"]
end
K0 --> XT
M --> XT
K1 --> K11
XT --> K01
XT --> M1
style K0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style K01 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style M fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style M1 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style K1 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style K11 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
subgraph "Update K0 with XOR transformation"
K01["KPerBlock/KPack * MLdsLayer<br/>K0'"]
M1["MPerBlock/MLdsLayer<br/>M"]
K11["KPack<br/>K1"]
end
K0 --> XT
M --> XT
K1 --> K11
XT --> K01
XT --> M1
style K0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style K01 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style M fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style M1 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style K1 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style K11 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
.. image:: diagrams/lds_index_swapping_1.svg
:alt: Diagram
:align: center
.. image:: diagrams/lds_index_swapping_1.svg
:alt: Diagram
:align: center
@@ -83,45 +91,53 @@ The transformed K0' is split into L and K0'' components, creating an intermediat
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
..
Original mermaid diagram (edit here, then run update_diagrams.py)
graph TB
subgraph "3D LDS coordinate [K0', M, K1]"
K0["KPerBlock/KPack * MLdsLayer<br/>K0'"]
M["MPerBlock/MLdsLayer<br/>M"]
K1["KPack<br/>K1"]
end
.. mermaid::
subgraph "Unmerge into 2 components"
UM["make_unmerge_transform"]
end
graph TB
subgraph "3D LDS coordinate [K0', M, K1]"
K0["KPerBlock/KPack * MLdsLayer<br/>K0'"]
M["MPerBlock/MLdsLayer<br/>M"]
K1["KPack<br/>K1"]
end
subgraph "Unmerge into 2 components"
UM["make_unmerge_transform"]
end
subgraph "4D intermediate transformation space"
L["MLdsLayer<br/>L"]
M1["MPerBlock/MLdsLayer<br/>M"]
K01["KPerBlock/KPack<br/>K0''"]
K11["KPack<br/>K1"]
end
K0 --> UM
M --> M1
K1 --> K11
UM --> L
UM --> K01
style K0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style L fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style K01 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style M fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style M1 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style K1 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style K11 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
subgraph "4D intermediate transformation space"
L["MLdsLayer<br/>L"]
M1["MPerBlock/MLdsLayer<br/>M"]
K01["KPerBlock/KPack<br/>K0''"]
K11["KPack<br/>K1"]
end
K0 --> UM
M --> M1
K1 --> K11
UM --> L
UM --> K01
style K0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style L fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style K01 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style M fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style M1 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style K1 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style K11 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
.. image:: diagrams/lds_index_swapping_2.svg
:alt: Diagram
:align: center
.. image:: diagrams/lds_index_swapping_2.svg
:alt: Diagram
:align: center
@@ -142,50 +158,58 @@ The final step merges the 4D coordinates back into 2D transformed coordinates (M
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
..
Original mermaid diagram (edit here, then run update_diagrams.py)
graph TB
subgraph "4D LDS Coordinates [L, M, K0'', K1]"
L["MLdsLayer<br/>L"]
M1["MPerBlock/MLdsLayer<br/>M"]
K0["KPerBlock/KPack<br/>K0''"]
K1["KPack<br/>K1"]
end
.. mermaid::
subgraph "Merge into 1 component"
ME0["make_merge_transform"]
end
graph TB
subgraph "4D LDS Coordinates [L, M, K0'', K1]"
L["MLdsLayer<br/>L"]
M1["MPerBlock/MLdsLayer<br/>M"]
K0["KPerBlock/KPack<br/>K0''"]
K1["KPack<br/>K1"]
end
subgraph "Merge into 1 component"
ME0["make_merge_transform"]
end
subgraph "Merge into 1 component"
ME1["make_merge_transform"]
end
subgraph "Transformed 2D coordinates [M', K']"
M11["MPerBlock<br/>M'"]
K01["KPerBlock<br/>K'"]
end
L --> ME0
M1 --> ME0
K0 --> ME1
K1 --> ME1
ME0 --> M11
ME1 --> K01
style K0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style K1 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style K01 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style M1 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style L fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style M11 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
subgraph "Merge into 1 component"
ME1["make_merge_transform"]
end
subgraph "Transformed 2D coordinates [M', K']"
M11["MPerBlock<br/>M'"]
K01["KPerBlock<br/>K'"]
end
L --> ME0
M1 --> ME0
K0 --> ME1
K1 --> ME1
ME0 --> M11
ME1 --> K01
style K0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style K1 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style K01 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style M1 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style L fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style M11 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
.. image:: diagrams/lds_index_swapping_3.svg
:alt: Diagram
:align: center
.. image:: diagrams/lds_index_swapping_3.svg
:alt: Diagram
:align: center

View File

@@ -106,27 +106,35 @@ LoadStoreTraits employs a advanced algorithm to select the best dimension for ve
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
..
Original mermaid diagram (edit here, then run update_diagrams.py)
graph TD
A[Analyze Distribution] --> B{Check Each Dimension}
B --> C[Calculate Stride]
C --> D{Stride == 1?}
D -->|Yes| E[Candidate for Vectorization]
D -->|No| F[Skip Dimension]
E --> G[Check Alignment]
G --> H[Check Vector Size]
H --> I[Score Dimension]
F --> B
I --> J[Select Best Dimension]
J --> K[Configure Vector Access]
style A fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style J fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style K fill:#fff3e0,stroke:#f57c00,stroke-width:2px
.. mermaid::
graph TD
A[Analyze Distribution] --> B{Check Each Dimension}
B --> C[Calculate Stride]
C --> D{Stride == 1?}
D -->|Yes| E[Candidate for Vectorization]
D -->|No| F[Skip Dimension]
E --> G[Check Alignment]
G --> H[Check Vector Size]
H --> I[Score Dimension]
F --> B
I --> J[Select Best Dimension]
J --> K[Configure Vector Access]
style A fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style J fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style K fill:#fff3e0,stroke:#f57c00,stroke-width:2px
.. image:: diagrams/load_store_traits_1.svg
:alt: Diagram
:align: center
.. image:: diagrams/load_store_traits_1.svg
:alt: Diagram
:align: center
@@ -174,36 +182,44 @@ LoadStoreTraits creates efficient access patterns using space-filling curves:
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
..
Original mermaid diagram (edit here, then run update_diagrams.py)
graph LR
subgraph "Linear Traversal"
L1["0→1→2→3"]
L2["4→5→6→7"]
L3["Cache miss"]
L4["8→9→10→11"]
end
subgraph "Snake Pattern"
S1["0→1→2→3"]
S2["7←6←5←4"]
S3["Cache hit!"]
S4["8→9→10→11"]
end
L1 --> L2
L2 --> L3
L3 --> L4
S1 --> S2
S2 --> S3
S3 --> S4
style L3 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style S3 fill:#d1fae5,stroke:#10b981,stroke-width:2px
.. mermaid::
graph LR
subgraph "Linear Traversal"
L1["0→1→2→3"]
L2["4→5→6→7"]
L3["Cache miss"]
L4["8→9→10→11"]
end
subgraph "Snake Pattern"
S1["0→1→2→3"]
S2["7←6←5←4"]
S3["Cache hit!"]
S4["8→9→10→11"]
end
L1 --> L2
L2 --> L3
L3 --> L4
S1 --> S2
S2 --> S3
S3 --> S4
style L3 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style S3 fill:#d1fae5,stroke:#10b981,stroke-width:2px
.. image:: diagrams/load_store_traits_2.svg
:alt: Diagram
:align: center
.. image:: diagrams/load_store_traits_2.svg
:alt: Diagram
:align: center

View File

@@ -188,36 +188,44 @@ The snake pattern reverses traversal direction on alternate rows, minimizing the
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
..
Original mermaid diagram (edit here, then run update_diagrams.py)
graph LR
subgraph "Linear Pattern"
L1["Row 0: →"]
L2["Row 1: →"]
L3["Jump back"]
L4["Row 2: →"]
end
subgraph "Snake Pattern"
S1["Row 0: →"]
S2["Row 1: ←"]
S3["Continue"]
S4["Row 2: "]
end
L1 --> L3
L3 --> L2
L2 --> L3
L3 --> L4
S1 --> S2
S2 --> S4
style L3 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style S3 fill:#d1fae5,stroke:#10b981,stroke-width:2px
.. mermaid::
graph LR
subgraph "Linear Pattern"
L1["Row 0: →"]
L2["Row 1: →"]
L3["Jump back"]
L4["Row 2: →"]
end
subgraph "Snake Pattern"
S1["Row 0: →"]
S2["Row 1: "]
S3["Continue"]
S4["Row 2: →"]
end
L1 --> L3
L3 --> L2
L2 --> L3
L3 --> L4
S1 --> S2
S2 --> S4
style L3 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style S3 fill:#d1fae5,stroke:#10b981,stroke-width:2px
.. image:: diagrams/space_filling_curve.svg
:alt: Diagram
:align: center
.. image:: diagrams/space_filling_curve.svg
:alt: Diagram
:align: center

View File

@@ -88,21 +88,29 @@ The memory layout follows a hierarchical pattern:
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
..
Original mermaid diagram (edit here, then run update_diagrams.py)
graph TD
A[Global Tensor 64x64] --> B[Thread Block 16x16]
B --> C[Thread 0,0<br/>Elements 0:3,0:3]
B --> D[Thread 0,1<br/>Elements 0:3,4:7]
B --> E[Thread 1,0<br/>Elements 4:7,0:3]
B --> F[...]
C --> G[Local Array<br/>16 elements]
D --> H[Local Array<br/>16 elements]
E --> I[Local Array<br/>16 elements]
.. mermaid::
graph TD
A[Global Tensor 64x64] --> B[Thread Block 16x16]
B --> C[Thread 0,0<br/>Elements 0:3,0:3]
B --> D[Thread 0,1<br/>Elements 0:3,4:7]
B --> E[Thread 1,0<br/>Elements 4:7,0:3]
B --> F[...]
C --> G[Local Array<br/>16 elements]
D --> H[Local Array<br/>16 elements]
E --> I[Local Array<br/>16 elements]
.. image:: diagrams/static_distributed_tensor.svg
:alt: Diagram
:align: center
.. image:: diagrams/static_distributed_tensor.svg
:alt: Diagram
:align: center

View File

@@ -18,35 +18,43 @@ The key insight is the "load once, use many times" pattern. Load X data once int
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
..
Original mermaid diagram (edit here, then run update_diagrams.py)
flowchart LR
subgraph "X-Tile (Reused)"
XT["X data loaded once<br/>Stays in registers"]
end
subgraph "Y-Sweep"
Y1["Y position 0"]
Y2["Y position 1"]
Y3["Y position 2"]
YN["Y position N"]
end
subgraph "Computation"
C["Process(X, Y)"]
end
XT --> C
Y1 --> C
Y2 --> C
Y3 --> C
YN --> C
style XT fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style C fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
.. mermaid::
flowchart LR
subgraph "X-Tile (Reused)"
XT["X data loaded once<br/>Stays in registers"]
end
subgraph "Y-Sweep"
Y1["Y position 0"]
Y2["Y position 1"]
Y3["Y position 2"]
YN["Y position N"]
end
subgraph "Computation"
C["Process(X, Y)"]
end
XT --> C
Y1 --> C
Y2 --> C
Y3 --> C
YN --> C
style XT fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style C fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
.. image:: diagrams/sweep_tile_1.svg
:alt: Diagram
:align: center
.. image:: diagrams/sweep_tile_1.svg
:alt: Diagram
:align: center
@@ -122,30 +130,38 @@ The sweep pattern provides significant memory efficiency benefits:
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
..
Original mermaid diagram (edit here, then run update_diagrams.py)
graph TB
subgraph "Traditional Approach"
T1["Load X[0]"] --> P1["Process"]
T2["Load Y[0]"] --> P1
T3["Load X[0]"] --> P2["Process"]
T4["Load Y[1]"] --> P2
T5["Load X[0]"] --> P3["Process"]
T6["Load Y[2]"] --> P3
Note1["X loaded 3 times!"]
end
subgraph "Sweep Approach"
S1["Load X[0]"] --> SP["Process with<br/>Y[0], Y[1], Y[2]"]
S2["Load Y[0,1,2]"] --> SP
Note2["X loaded once!"]
end
style Note1 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style Note2 fill:#d1fae5,stroke:#10b981,stroke-width:2px
.. mermaid::
graph TB
subgraph "Traditional Approach"
T1["Load X[0]"] --> P1["Process"]
T2["Load Y[0]"] --> P1
T3["Load X[0]"] --> P2["Process"]
T4["Load Y[1]"] --> P2
T5["Load X[0]"] --> P3["Process"]
T6["Load Y[2]"] --> P3
Note1["X loaded 3 times!"]
end
subgraph "Sweep Approach"
S1["Load X[0]"] --> SP["Process with<br/>Y[0], Y[1], Y[2]"]
S2["Load Y[0,1,2]"] --> SP
Note2["X loaded once!"]
end
style Note1 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style Note2 fill:#d1fae5,stroke:#10b981,stroke-width:2px
.. image:: diagrams/sweep_tile_2.svg
:alt: Diagram
:align: center
.. image:: diagrams/sweep_tile_2.svg
:alt: Diagram
:align: center
@@ -381,37 +397,45 @@ Sweep operations provide several performance benefits:
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
..
Original mermaid diagram (edit here, then run update_diagrams.py)
graph TB
subgraph "Sweep Performance Benefits"
B1["Zero runtime overhead<br/>Compile-time unrolling"]
B2["Perfect memory coalescing<br/>Sequential access patterns"]
B3["Automatic vectorization<br/>Compiler optimizations"]
B4["Register reuse<br/>X data stays in VGPR"]
end
subgraph "Use Cases"
U1["Matrix Multiplication<br/>Reuse A columns"]
U2["Convolution<br/>Reuse filter weights"]
U3["Reduction<br/>Accumulate over Y"]
U4["Broadcast<br/>Apply X to all Y"]
end
B1 --> Performance["High Performance"]
B2 --> Performance
B3 --> Performance
B4 --> Performance
Performance --> U1
Performance --> U2
Performance --> U3
Performance --> U4
style Performance fill:#d1fae5,stroke:#10b981,stroke-width:3px
.. mermaid::
graph TB
subgraph "Sweep Performance Benefits"
B1["Zero runtime overhead<br/>Compile-time unrolling"]
B2["Perfect memory coalescing<br/>Sequential access patterns"]
B3["Automatic vectorization<br/>Compiler optimizations"]
B4["Register reuse<br/>X data stays in VGPR"]
end
subgraph "Use Cases"
U1["Matrix Multiplication<br/>Reuse A columns"]
U2["Convolution<br/>Reuse filter weights"]
U3["Reduction<br/>Accumulate over Y"]
U4["Broadcast<br/>Apply X to all Y"]
end
B1 --> Performance["High Performance"]
B2 --> Performance
B3 --> Performance
B4 --> Performance
Performance --> U1
Performance --> U2
Performance --> U3
Performance --> U4
style Performance fill:#d1fae5,stroke:#10b981,stroke-width:3px
.. image:: diagrams/sweep_tile_3.svg
:alt: Diagram
:align: center
.. image:: diagrams/sweep_tile_3.svg
:alt: Diagram
:align: center
@@ -453,28 +477,36 @@ Complete workflow example:
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
..
Original mermaid diagram (edit here, then run update_diagrams.py)
flowchart TB
subgraph "Complete Workflow"
TD["TileDistribution<br/>Define data layout"]
TW["TileWindow<br/>Create view"]
DT["DistributedTensor<br/>Load X data"]
ST["SweepTile<br/>Iterate Y positions"]
R["Results<br/>Store outputs"]
end
TD --> TW
TW --> DT
DT --> ST
ST --> R
style TD fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style ST fill:#fff3e0,stroke:#f57c00,stroke-width:3px
style R fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
.. mermaid::
flowchart TB
subgraph "Complete Workflow"
TD["TileDistribution<br/>Define data layout"]
TW["TileWindow<br/>Create view"]
DT["DistributedTensor<br/>Load X data"]
ST["SweepTile<br/>Iterate Y positions"]
R["Results<br/>Store outputs"]
end
TD --> TW
TW --> DT
DT --> ST
ST --> R
style TD fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style ST fill:#fff3e0,stroke:#f57c00,stroke-width:3px
style R fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
.. image:: diagrams/sweep_tile_4.svg
:alt: Diagram
:align: center
.. image:: diagrams/sweep_tile_4.svg
:alt: Diagram
:align: center

View File

@@ -13,40 +13,52 @@ The power of TensorView lies in its ability to present different logical views o
TensorView Architecture
-----------------------
.. raw:: html
<div class="mermaid" style="margin: 0 auto; display: block; width: 60%;">
graph TB
subgraph "Memory Foundation"
Memory["Flat Memory Array<br/>0 1 2 3 4 5 6 7 8 9 10 11"]
end
subgraph "Access Layer"
BufferView["BufferView<br/>Linear Memory Access"]
Descriptor["TensorDescriptor<br/>Shape & Stride Info"]
end
subgraph "Tensor Layer"
TensorView["TensorView<br/>Multi-dimensional Access"]
end
subgraph "Logical View"
Matrix["2D Matrix View<br/>[3×4]<br/>[[0,1,2,3]<br/>[4,5,6,7]<br/>[8,9,10,11]]"]
end
Memory --> BufferView
Memory --> Descriptor
BufferView --> TensorView
Descriptor --> TensorView
TensorView --> Matrix
style Memory fill:#d1fae5,stroke:#10b981,stroke-width:2px
style BufferView fill:#dbeafe,stroke:#3b82f6,stroke-width:2px
style Descriptor fill:#fed7aa,stroke:#f59e0b,stroke-width:2px
style TensorView fill:#fce7f3,stroke:#ec4899,stroke-width:2px
style Matrix fill:#e9d5ff,stroke:#9333ea,stroke-width:2px
</div>
..
Original mermaid diagram (edit here, then run update_diagrams.py)
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph TB
subgraph "Memory Foundation"
Memory["Flat Memory Array<br/>0 1 2 3 4 5 6 7 8 9 10 11"]
end
subgraph "Access Layer"
BufferView["BufferView<br/>Linear Memory Access"]
Descriptor["TensorDescriptor<br/>Shape & Stride Info"]
end
subgraph "Tensor Layer"
TensorView["TensorView<br/>Multi-dimensional Access"]
end
subgraph "Logical View"
Matrix["2D Matrix View<br/>[3×4]<br/>[[0,1,2,3]<br/>[4,5,6,7]<br/>[8,9,10,11]]"]
end
Memory --> BufferView
Memory --> Descriptor
BufferView --> TensorView
Descriptor --> TensorView
TensorView --> Matrix
style Memory fill:#d1fae5,stroke:#10b981,stroke-width:2px
style BufferView fill:#dbeafe,stroke:#3b82f6,stroke-width:2px
style Descriptor fill:#fed7aa,stroke:#f59e0b,stroke-width:2px
style TensorView fill:#fce7f3,stroke:#ec4899,stroke-width:2px
style Matrix fill:#e9d5ff,stroke:#9333ea,stroke-width:2px
.. image:: diagrams/tensor_views_1.svg
:alt: Diagram
:align: center
The Foundation: BufferView and TensorDescriptor
------------------------------------------------
@@ -109,68 +121,92 @@ Coordinate-Based Access
The fundamental operation of TensorView is translating multi-dimensional coordinates into memory accesses. This translation happens through a advanced pipeline that maintains efficiency while providing flexibility:
.. raw:: html
<div class="mermaid">
flowchart LR
subgraph "User Input"
Coord["Coordinate<br/>(1, 2)"]
end
subgraph "TensorView Processing"
Shape["Shape Check<br/>row < 3?<br/>col < 4?"]
Stride["Apply Strides<br/>offset = 1×4 + 2×1"]
Buffer["BufferView Access<br/>buffer[6]"]
end
subgraph "Result"
Value["Value: 6"]
end
Coord --> Shape
Shape -->|Valid| Stride
Stride --> Buffer
Buffer --> Value
style Coord fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
style Shape fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
style Stride fill:#dcfce7,stroke:#10b981,stroke-width:2px
style Buffer fill:#dbeafe,stroke:#3b82f6,stroke-width:2px
style Value fill:#d1fae5,stroke:#10b981,stroke-width:2px
</div>
..
Original mermaid diagram (edit here, then run update_diagrams.py)
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
flowchart LR
subgraph "User Input"
Coord["Coordinate<br/>(1, 2)"]
end
subgraph "TensorView Processing"
Shape["Shape Check<br/>row < 3?<br/>col < 4?"]
Stride["Apply Strides<br/>offset = 1×4 + 2×1"]
Buffer["BufferView Access<br/>buffer[6]"]
end
subgraph "Result"
Value["Value: 6"]
end
Coord --> Shape
Shape -->|Valid| Stride
Stride --> Buffer
Buffer --> Value
style Coord fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
style Shape fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
style Stride fill:#dcfce7,stroke:#10b981,stroke-width:2px
style Buffer fill:#dbeafe,stroke:#3b82f6,stroke-width:2px
style Value fill:#d1fae5,stroke:#10b981,stroke-width:2px
.. image:: diagrams/tensor_views_2.svg
:alt: Diagram
:align: center
Memory Layouts and Strides
--------------------------
One of the most key features of TensorView is its ability to represent different memory layouts through stride manipulation. This capability enables zero-copy transformations that would otherwise require expensive memory operations:
.. raw:: html
<div class="mermaid">
graph TB
subgraph "Row-Major Layout (C-style)"
RM["Memory: [0,1,2,3,4,5,6,7,8,9,10,11]<br/>Shape: (3,4)<br/>Strides: (4,1)"]
RMMatrix["[[0, 1, 2, 3]<br/> [4, 5, 6, 7]<br/> [8, 9, 10, 11]]"]
RM --> RMMatrix
end
subgraph "Column-Major Layout (Fortran-style)"
CM["Memory: [0,3,6,9,1,4,7,10,2,5,8,11]<br/>Shape: (3,4)<br/>Strides: (1,3)"]
CMMatrix["[[0, 1, 2, 3]<br/> [4, 5, 6, 7]<br/> [8, 9, 10, 11]]"]
CM --> CMMatrix
end
subgraph "Custom Stride (Transposed View)"
TV["Memory: [0,1,2,3,4,5,6,7,8,9,10,11]<br/>Shape: (4,3)<br/>Strides: (1,4)"]
TVMatrix["[[0, 4, 8]<br/> [1, 5, 9]<br/> [2, 6, 10]<br/> [3, 7, 11]]"]
TV --> TVMatrix
end
style RM fill:#e0f2fe,stroke:#0284c7,stroke-width:2px
style CM fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
style TV fill:#f3e8ff,stroke:#9333ea,stroke-width:2px
</div>
..
Original mermaid diagram (edit here, then run update_diagrams.py)
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph TB
subgraph "Row-Major Layout (C-style)"
RM["Memory: [0,1,2,3,4,5,6,7,8,9,10,11]<br/>Shape: (3,4)<br/>Strides: (4,1)"]
RMMatrix["[[0, 1, 2, 3]<br/> [4, 5, 6, 7]<br/> [8, 9, 10, 11]]"]
RM --> RMMatrix
end
subgraph "Column-Major Layout (Fortran-style)"
CM["Memory: [0,3,6,9,1,4,7,10,2,5,8,11]<br/>Shape: (3,4)<br/>Strides: (1,3)"]
CMMatrix["[[0, 1, 2, 3]<br/> [4, 5, 6, 7]<br/> [8, 9, 10, 11]]"]
CM --> CMMatrix
end
subgraph "Custom Stride (Transposed View)"
TV["Memory: [0,1,2,3,4,5,6,7,8,9,10,11]<br/>Shape: (4,3)<br/>Strides: (1,4)"]
TVMatrix["[[0, 4, 8]<br/> [1, 5, 9]<br/> [2, 6, 10]<br/> [3, 7, 11]]"]
TV --> TVMatrix
end
style RM fill:#e0f2fe,stroke:#0284c7,stroke-width:2px
style CM fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
style TV fill:#f3e8ff,stroke:#9333ea,stroke-width:2px
.. image:: diagrams/tensor_views_3.svg
:alt: Diagram
:align: center
Row-Major vs Column-Major Layouts
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -287,33 +323,45 @@ Memory Access Patterns
The efficiency of TensorView operations depends critically on memory access patterns. Understanding these patterns is essential for achieving optimal performance (see :ref:`ck_tile_gpu_basics` for hardware considerations):
.. raw:: html
<div class="mermaid" style="margin: 0 auto; display: block; width:70%;">
graph LR
subgraph "Memory Access Patterns"
Seq["Sequential Access<br/>(Good cache usage)"]
Stride["Strided Access<br/>(May cause cache misses)"]
Random["Random Access<br/>(Poor cache usage)"]
end
subgraph "Optimization Strategies"
Opt1["Use row-major for row iteration"]
Opt2["Use col-major for column iteration"]
Opt3["Minimize stride between accesses"]
Opt4["Vectorize when possible"]
end
Seq --> Opt1
Stride --> Opt2
Stride --> Opt3
Random --> Opt4
style Seq fill:#d1fae5,stroke:#10b981,stroke-width:2px
style Stride fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
style Random fill:#fee2e2,stroke:#ef4444,stroke-width:2px
</div>
..
Original mermaid diagram (edit here, then run update_diagrams.py)
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph LR
subgraph "Memory Access Patterns"
Seq["Sequential Access<br/>(Good cache usage)"]
Stride["Strided Access<br/>(May cause cache misses)"]
Random["Random Access<br/>(Poor cache usage)"]
end
subgraph "Optimization Strategies"
Opt1["Use row-major for row iteration"]
Opt2["Use col-major for column iteration"]
Opt3["Minimize stride between accesses"]
Opt4["Vectorize when possible"]
end
Seq --> Opt1
Stride --> Opt2
Stride --> Opt3
Random --> Opt4
style Seq fill:#d1fae5,stroke:#10b981,stroke-width:2px
style Stride fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
style Random fill:#fee2e2,stroke:#ef4444,stroke-width:2px
.. image:: diagrams/tensor_views_4.svg
:alt: Diagram
:align: center
Compile-Time Optimization
~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -339,36 +387,48 @@ TensorView vs BufferView
Understanding when to use TensorView versus BufferView is crucial for writing efficient code:
.. raw:: html
<div class="mermaid">
graph TB
subgraph "BufferView"
BV1["Linear indexing only"]
BV2["buffer[5]"]
BV3["No shape information"]
BV4["Direct memory access"]
end
subgraph "TensorView"
TV1["Multi-dimensional indexing"]
TV2["tensor(1, 2)"]
TV3["Shape-aware operations"]
TV4["Coordinate transformations"]
end
subgraph "Use Cases"
UC1["BufferView: Low-level memory ops"]
UC2["TensorView: Matrix/tensor algorithms"]
end
BV1 --> UC1
TV1 --> UC2
style BV1 fill:#dbeafe,stroke:#3b82f6,stroke-width:2px
style TV1 fill:#fce7f3,stroke:#ec4899,stroke-width:2px
</div>
..
Original mermaid diagram (edit here, then run update_diagrams.py)
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph TB
subgraph "BufferView"
BV1["Linear indexing only"]
BV2["buffer[5]"]
BV3["No shape information"]
BV4["Direct memory access"]
end
subgraph "TensorView"
TV1["Multi-dimensional indexing"]
TV2["tensor(1, 2)"]
TV3["Shape-aware operations"]
TV4["Coordinate transformations"]
end
subgraph "Use Cases"
UC1["BufferView: Low-level memory ops"]
UC2["TensorView: Matrix/tensor algorithms"]
end
BV1 --> UC1
TV1 --> UC2
style BV1 fill:#dbeafe,stroke:#3b82f6,stroke-width:2px
style TV1 fill:#fce7f3,stroke:#ec4899,stroke-width:2px
.. image:: diagrams/tensor_views_5.svg
:alt: Diagram
:align: center
BufferView excels at raw memory operations where linear access is natural or where the overhead of coordinate calculation would be prohibitive. TensorView shines when algorithms naturally think in terms of multi-dimensional coordinates, such as matrix operations, image processing, or tensor contractions.
Integration with Tile Distribution

View File

@@ -82,53 +82,61 @@ Composable Kernel abstracts thread identification into partition indices, buildi
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
..
Original mermaid diagram (edit here, then run update_diagrams.py)
graph TB
subgraph "GPU Device"
subgraph "Thread Block"
subgraph "Warp 0"
T0["Thread 0<br/>lane_id=0"]
T1["Thread 1<br/>lane_id=1"]
T2["..."]
T31["Thread 31<br/>lane_id=31"]
end
subgraph "Warp 1"
T32["Thread 32<br/>lane_id=0"]
T33["Thread 33<br/>lane_id=1"]
T34["..."]
T63["Thread 63<br/>lane_id=31"]
end
W2["Warp 2"]
W3["..."]
W7["Warp 7"]
end
end
subgraph "Thread Identification"
TID["Thread ID = blockIdx.x * blockDim.x + threadIdx.x"]
WID["Warp ID = threadIdx.x / 32"]
LID["Lane ID = threadIdx.x % 32"]
end
subgraph "P-space Mapping"
P["P-coordinates<br/>NDimP=1: [thread_id]<br/>NDimP=2: [warp_id, lane_id]"]
end
T0 --> TID
TID --> WID
TID --> LID
WID --> P
LID --> P
style T0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style T32 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style P fill:#fff3e0,stroke:#f57c00,stroke-width:3px
.. mermaid::
graph TB
subgraph "GPU Device"
subgraph "Thread Block"
subgraph "Warp 0"
T0["Thread 0<br/>lane_id=0"]
T1["Thread 1<br/>lane_id=1"]
T2["..."]
T31["Thread 31<br/>lane_id=31"]
end
subgraph "Warp 1"
T32["Thread 32<br/>lane_id=0"]
T33["Thread 33<br/>lane_id=1"]
T34["..."]
T63["Thread 63<br/>lane_id=31"]
end
W2["Warp 2"]
W3["..."]
W7["Warp 7"]
end
end
subgraph "Thread Identification"
TID["Thread ID = blockIdx.x * blockDim.x + threadIdx.x"]
WID["Warp ID = threadIdx.x / 32"]
LID["Lane ID = threadIdx.x % 32"]
end
subgraph "P-space Mapping"
P["P-coordinates<br/>NDimP=1: [thread_id]<br/>NDimP=2: [warp_id, lane_id]"]
end
T0 --> TID
TID --> WID
TID --> LID
WID --> P
LID --> P
style T0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style T32 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style P fill:#fff3e0,stroke:#f57c00,stroke-width:3px
.. image:: diagrams/thread_mapping_1.svg
:alt: Diagram
:align: center
.. image:: diagrams/thread_mapping_1.svg
:alt: Diagram
:align: center
@@ -179,43 +187,51 @@ Once threads know their IDs, they need to map those IDs to specific data element
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
..
Original mermaid diagram (edit here, then run update_diagrams.py)
graph TB
subgraph "Thread to Data Mapping"
subgraph "Thread Grid"
T00["Thread[0,0]<br/>Warp 0"]
T01["Thread[0,1]<br/>Warp 0"]
T10["Thread[1,0]<br/>Warp 1"]
T11["Thread[1,1]<br/>Warp 1"]
end
subgraph "Data Tiles"
D00["Data[0:4, 0:4]<br/>16 elements"]
D01["Data[0:4, 4:8]<br/>16 elements"]
D10["Data[4:8, 0:4]<br/>16 elements"]
D11["Data[4:8, 4:8]<br/>16 elements"]
end
subgraph "Memory Access"
MA["Coalesced Access<br/>Adjacent threads → Adjacent memory"]
end
end
T00 --> D00
T01 --> D01
T10 --> D10
T11 --> D11
D00 --> MA
D01 --> MA
style T00 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style D00 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style MA fill:#fff3e0,stroke:#f57c00,stroke-width:2px
.. mermaid::
graph TB
subgraph "Thread to Data Mapping"
subgraph "Thread Grid"
T00["Thread[0,0]<br/>Warp 0"]
T01["Thread[0,1]<br/>Warp 0"]
T10["Thread[1,0]<br/>Warp 1"]
T11["Thread[1,1]<br/>Warp 1"]
end
subgraph "Data Tiles"
D00["Data[0:4, 0:4]<br/>16 elements"]
D01["Data[0:4, 4:8]<br/>16 elements"]
D10["Data[4:8, 0:4]<br/>16 elements"]
D11["Data[4:8, 4:8]<br/>16 elements"]
end
subgraph "Memory Access"
MA["Coalesced Access<br/>Adjacent threads → Adjacent memory"]
end
end
T00 --> D00
T01 --> D01
T10 --> D10
T11 --> D11
D00 --> MA
D01 --> MA
style T00 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style D00 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style MA fill:#fff3e0,stroke:#f57c00,stroke-width:2px
.. image:: diagrams/thread_mapping_2.svg
:alt: Diagram
:align: center
.. image:: diagrams/thread_mapping_2.svg
:alt: Diagram
:align: center

View File

@@ -19,79 +19,103 @@ The elegance of this design lies in its ability to adapt to diverse computationa
Complete Tile Distribution System Overview
------------------------------------------
.. raw:: html
<div class="mermaid">
graph TB
subgraph "Logical View"
T["Tensor<br/>Multi-dimensional data"]
TD["TileDistribution<br/>Work assignment"]
TW["TileWindow<br/>Data view"]
end
subgraph "Coordinate Spaces"
X["X: Physical tensor coords"]
Y["Y: Tile pattern coords"]
P["P: Processing element coords"]
R["R: Replication coords (optional)"]
end
subgraph "GPU Execution"
W["Warps<br/>32 threads each"]
L["Lanes<br/>Thread within warp"]
REG["Registers<br/>Thread-local storage"]
end
T --> TD
TD --> TW
TD --> X
TD --> Y
TD --> P
TD --> R
P --> W
P --> L
TW --> REG
style TD fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
style P fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style REG fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
</div>
..
Original mermaid diagram (edit here, then run update_diagrams.py)
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph TB
subgraph "Logical View"
T["Tensor<br/>Multi-dimensional data"]
TD["TileDistribution<br/>Work assignment"]
TW["TileWindow<br/>Data view"]
end
subgraph "Coordinate Spaces"
X["X: Physical tensor coords"]
Y["Y: Tile pattern coords"]
P["P: Processing element coords"]
R["R: Replication coords (optional)"]
end
subgraph "GPU Execution"
W["Warps<br/>32 threads each"]
L["Lanes<br/>Thread within warp"]
REG["Registers<br/>Thread-local storage"]
end
T --> TD
TD --> TW
TD --> X
TD --> Y
TD --> P
TD --> R
P --> W
P --> L
TW --> REG
style TD fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
style P fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style REG fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
.. image:: diagrams/tile_distribution_1.svg
:alt: Diagram
:align: center
Coordinate System Architecture
------------------------------
.. raw:: html
<div class="mermaid">
flowchart LR
subgraph "Input"
TC["Thread Coordinates<br/>(warpId, laneId)"]
end
subgraph "Transformation Pipeline"
P2Y["P → Y<br/>Thread to pattern"]
Y2X["Y → X<br/>Pattern to physical"]
Y2D["Y → D<br/>Pattern to register"]
end
subgraph "Output"
MC["Memory Coordinates<br/>Global addresses"]
RI["Register Indices<br/>Local storage"]
end
TC --> P2Y
P2Y --> Y2X
P2Y --> Y2D
Y2X --> MC
Y2D --> RI
style TC fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
style MC fill:#d1fae5,stroke:#10b981,stroke-width:2px
style RI fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
</div>
..
Original mermaid diagram (edit here, then run update_diagrams.py)
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
flowchart LR
subgraph "Input"
TC["Thread Coordinates<br/>(warpId, laneId)"]
end
subgraph "Transformation Pipeline"
P2Y["P → Y<br/>Thread to pattern"]
Y2X["Y → X<br/>Pattern to physical"]
Y2D["Y → D<br/>Pattern to register"]
end
subgraph "Output"
MC["Memory Coordinates<br/>Global addresses"]
RI["Register Indices<br/>Local storage"]
end
TC --> P2Y
P2Y --> Y2X
P2Y --> Y2D
Y2X --> MC
Y2D --> RI
style TC fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
style MC fill:#d1fae5,stroke:#10b981,stroke-width:2px
style RI fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
.. image:: diagrams/tile_distribution_2.svg
:alt: Diagram
:align: center
What is Tile Distribution?
--------------------------
@@ -132,41 +156,53 @@ The key insight that makes TileDistribution effective is its ability to abstract
Problem Space Mapping
---------------------
.. raw:: html
<div class="mermaid" style="margin: 0 auto; display: block; width: 60%;">
..
Original mermaid diagram (edit here, then run update_diagrams.py)
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph TB
subgraph "Problem Space (256×256 Matrix)"
M["Full Matrix<br/>65,536 elements"]
T1["Tile 1<br/>32×32"]
T2["Tile 2<br/>32×32"]
TN["Tile N<br/>32×32"]
end
subgraph "Thread Assignment"
W0["Warp 0<br/>32 threads"]
W1["Warp 1<br/>32 threads"]
L0["Lane 0-31<br/>Individual threads"]
end
subgraph "Memory Pattern"
MP["Coalesced Access<br/>Sequential addresses<br/>No bank conflicts"]
end
M --> T1
M --> T2
M --> TN
T1 --> W0
T1 --> W1
W0 --> L0
L0 --> MP
style M fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style MP fill:#d1fae5,stroke:#10b981,stroke-width:2px
graph TB
subgraph "Problem Space (256×256 Matrix)"
M["Full Matrix<br/>65,536 elements"]
T1["Tile 1<br/>32×32"]
T2["Tile 2<br/>32×32"]
TN["Tile N<br/>32×32"]
end
subgraph "Thread Assignment"
W0["Warp 0<br/>32 threads"]
W1["Warp 1<br/>32 threads"]
L0["Lane 0-31<br/>Individual threads"]
end
subgraph "Memory Pattern"
MP["Coalesced Access<br/>Sequential addresses<br/>No bank conflicts"]
end
M --> T1
M --> T2
M --> TN
T1 --> W0
T1 --> W1
W0 --> L0
L0 --> MP
style M fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style MP fill:#d1fae5,stroke:#10b981,stroke-width:2px
</div>
.. image:: diagrams/tile_distribution_3.svg
:alt: Diagram
:align: center
Creating a TileDistribution
---------------------------
@@ -339,38 +375,50 @@ Let's see how to create and use a TileDistribution in practice:
Hierarchical Decomposition
--------------------------
.. raw:: html
<div class="mermaid" style="margin: 0 auto; display: block; width: 45%;">
graph TB
subgraph "Level 1: Block Distribution"
B["Thread Block<br/>256 threads"]
BT1["Block Tile 1<br/>64×64"]
BT2["Block Tile 2<br/>64×64"]
end
subgraph "Level 2: Warp Distribution"
W["Warp<br/>32 threads"]
WT1["Warp Tile 1<br/>16×16"]
WT2["Warp Tile 2<br/>16×16"]
end
subgraph "Level 3: Thread Distribution"
T["Thread"]
TT["Thread Tile<br/>2×2"]
end
B --> BT1
BT1 --> W
W --> WT1
WT1 --> T
T --> TT
style B fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style W fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style T fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
</div>
..
Original mermaid diagram (edit here, then run update_diagrams.py)
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph TB
subgraph "Level 1: Block Distribution"
B["Thread Block<br/>256 threads"]
BT1["Block Tile 1<br/>64×64"]
BT2["Block Tile 2<br/>64×64"]
end
subgraph "Level 2: Warp Distribution"
W["Warp<br/>32 threads"]
WT1["Warp Tile 1<br/>16×16"]
WT2["Warp Tile 2<br/>16×16"]
end
subgraph "Level 3: Thread Distribution"
T["Thread"]
TT["Thread Tile<br/>2×2"]
end
B --> BT1
BT1 --> W
W --> WT1
WT1 --> T
T --> TT
style B fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style W fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style T fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
.. image:: diagrams/tile_distribution_4.svg
:alt: Diagram
:align: center
Advanced Example: Matrix Multiplication Distribution
----------------------------------------------------
@@ -422,36 +470,48 @@ Advanced Example: Matrix Multiplication Distribution
Work Distribution Pattern
-------------------------
.. raw:: html
<div class="mermaid" style="margin: 0 auto; display: block; width: 30%;">
flowchart TB
subgraph "Matrix C (128×128)"
C["16,384 elements"]
end
subgraph "Thread Grid (32×32)"
TG["1,024 threads"]
end
subgraph "Per Thread"
PT["4×4 tile<br/>16 elements"]
end
subgraph "Memory Access"
MA["Coalesced reads<br/>Efficient writes<br/>No conflicts"]
end
C --> TG
TG --> PT
PT --> MA
style C fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style TG fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style PT fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style MA fill:#d1fae5,stroke:#10b981,stroke-width:2px
</div>
..
Original mermaid diagram (edit here, then run update_diagrams.py)
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
flowchart TB
subgraph "Matrix C (128×128)"
C["16,384 elements"]
end
subgraph "Thread Grid (32×32)"
TG["1,024 threads"]
end
subgraph "Per Thread"
PT["4×4 tile<br/>16 elements"]
end
subgraph "Memory Access"
MA["Coalesced reads<br/>Efficient writes<br/>No conflicts"]
end
C --> TG
TG --> PT
PT --> MA
style C fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style TG fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style PT fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style MA fill:#d1fae5,stroke:#10b981,stroke-width:2px
.. image:: diagrams/tile_distribution_5.svg
:alt: Diagram
:align: center
Memory Access Patterns
----------------------
@@ -464,79 +524,103 @@ One of the key benefits of TileDistribution is generating optimal memory access
Transformation Pipeline
-----------------------
.. raw:: html
<div class="mermaid" style="margin: 0 auto; display: block; width: 100%;">
graph LR
subgraph "Input"
TID["Thread ID<br/>(0-1023)"]
end
subgraph "Stage 1"
P["P-coordinates<br/>(warp, lane)"]
end
subgraph "Stage 2"
Y["Y-coordinates<br/>(tile position)"]
end
subgraph "Stage 3"
X["X-coordinates<br/>(tensor indices)"]
end
subgraph "Output"
ADDR["Memory addresses<br/>Register indices"]
end
TID --> P
P --> Y
Y --> X
X --> ADDR
style TID fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
style ADDR fill:#d1fae5,stroke:#10b981,stroke-width:2px
</div>
..
Original mermaid diagram (edit here, then run update_diagrams.py)
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph LR
subgraph "Input"
TID["Thread ID<br/>(0-1023)"]
end
subgraph "Stage 1"
P["P-coordinates<br/>(warp, lane)"]
end
subgraph "Stage 2"
Y["Y-coordinates<br/>(tile position)"]
end
subgraph "Stage 3"
X["X-coordinates<br/>(tensor indices)"]
end
subgraph "Output"
ADDR["Memory addresses<br/>Register indices"]
end
TID --> P
P --> Y
Y --> X
X --> ADDR
style TID fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
style ADDR fill:#d1fae5,stroke:#10b981,stroke-width:2px
.. image:: diagrams/tile_distribution_6.svg
:alt: Diagram
:align: center
Performance Comparison
----------------------
.. raw:: html
<div class="mermaid">
graph TB
subgraph "Manual Implementation"
M1["Calculate indices manually"]
M2["Handle boundary conditions"]
M3["Ensure coalescing"]
M4["Manage bank conflicts"]
M5["~200 lines of code"]
end
subgraph "With TileDistribution"
T1["make_tile_distribution()"]
T2["Automatic optimization"]
T3["~10 lines of code"]
end
subgraph "Performance"
P1["Same performance"]
P2["Fewer bugs"]
P3["Portable across GPUs"]
end
M1 --> M5
T1 --> T3
M5 --> P1
T3 --> P1
P1 --> P2
P2 --> P3
style M5 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style T3 fill:#d1fae5,stroke:#10b981,stroke-width:2px
style P3 fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
</div>
..
Original mermaid diagram (edit here, then run update_diagrams.py)
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph TB
subgraph "Manual Implementation"
M1["Calculate indices manually"]
M2["Handle boundary conditions"]
M3["Ensure coalescing"]
M4["Manage bank conflicts"]
M5["~200 lines of code"]
end
subgraph "With TileDistribution"
T1["make_tile_distribution()"]
T2["Automatic optimization"]
T3["~10 lines of code"]
end
subgraph "Performance"
P1["Same performance"]
P2["Fewer bugs"]
P3["Portable across GPUs"]
end
M1 --> M5
T1 --> T3
M5 --> P1
T3 --> P1
P1 --> P2
P2 --> P3
style M5 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style T3 fill:#d1fae5,stroke:#10b981,stroke-width:2px
style P3 fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
.. image:: diagrams/tile_distribution_7.svg
:alt: Diagram
:align: center
Summary
-------

View File

@@ -32,28 +32,36 @@ Zero-Copy Logical Operations
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
..
Original mermaid diagram (edit here, then run update_diagrams.py)
graph TB
subgraph "Tensor Coordinate Transformation"
US["Lower Dimension Space<br/>Source coordinate system"]
LS["Upper Dimension Space<br/>Target coordinate system"]
DATA["Linear Data in Memory<br/>Layout determined by tensor<br/>shape & strides"]
end
US -->|"Forward Transform"| LS
LS -->|"Inverse Transform"| US
DATA -.->|"Same data,<br/>different views"| US
DATA -.->|"Same data,<br/>different views"| LS
style US fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
style LS fill:#fff3e0,stroke:#f57c00,stroke-width:3px
style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
.. mermaid::
graph TB
subgraph "Tensor Coordinate Transformation"
US["Lower Dimension Space<br/>Source coordinate system"]
LS["Upper Dimension Space<br/>Target coordinate system"]
DATA["Linear Data in Memory<br/>Layout determined by tensor<br/>shape & strides"]
end
US -->|"Forward Transform"| LS
LS -->|"Inverse Transform"| US
DATA -.->|"Same data,<br/>different views"| US
DATA -.->|"Same data,<br/>different views"| LS
style US fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
style LS fill:#fff3e0,stroke:#f57c00,stroke-width:3px
style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
.. image:: diagrams/transforms_1.svg
:alt: Diagram
:align: center
.. image:: diagrams/transforms_1.svg
:alt: Diagram
:align: center
@@ -74,38 +82,46 @@ Transform System Architecture
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
..
Original mermaid diagram (edit here, then run update_diagrams.py)
graph TB
subgraph "Transform Types"
EMB["EmbedTransform<br/>Linear → Multi-D Strided"]
UNM["MergeTransform<br/>Multi-D → Linear"]
MRG["UnmergeTransform<br/>Linear → Multi-D"]
REP["ReplicateTransform<br/>0D → Multi-D Broadcast"]
OFF["OffsetTransform<br/>Translation"]
PAS["PassThroughTransform<br/>Identity"]
PAD["PadTransform<br/>Boundaries"]
end
subgraph "Operations"
FWD["Forward<br/>calculate_lower_index()"]
BWD["Backward<br/>calculate_upper_index()"]
UPD["Update<br/>update_lower_index()"]
end
EMB --> FWD
UNM --> FWD
MRG --> FWD
REP --> FWD
OFF --> FWD
PAS --> FWD
PAD --> FWD
style FWD fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
.. mermaid::
graph TB
subgraph "Transform Types"
EMB["EmbedTransform<br/>Linear → Multi-D Strided"]
UNM["MergeTransform<br/>Multi-D → Linear"]
MRG["UnmergeTransform<br/>Linear → Multi-D"]
REP["ReplicateTransform<br/>0D → Multi-D Broadcast"]
OFF["OffsetTransform<br/>Translation"]
PAS["PassThroughTransform<br/>Identity"]
PAD["PadTransform<br/>Boundaries"]
end
subgraph "Operations"
FWD["Forward<br/>calculate_lower_index()"]
BWD["Backward<br/>calculate_upper_index()"]
UPD["Update<br/>update_lower_index()"]
end
EMB --> FWD
UNM --> FWD
MRG --> FWD
REP --> FWD
OFF --> FWD
PAS --> FWD
PAD --> FWD
style FWD fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
.. image:: diagrams/transforms_2.svg
:alt: Diagram
:align: center
.. image:: diagrams/transforms_2.svg
:alt: Diagram
:align: center
@@ -117,28 +133,36 @@ MergeTransform collapses multiple dimensions from the lower coordinate space int
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
..
Original mermaid diagram (edit here, then run update_diagrams.py)
graph TB
subgraph "MergeTransform: Multi-D → Linear"
LS["Lower Coordinate Space<br/>2D: [4, 5]<br/>Coord: (2, 3)"]
US["Upper Coordinate Space<br/>1D Linear<br/>Index: 13"]
DATA["Same Tensor Data<br/>Layout: row-major<br/>Size: 20 elements"]
end
LS -->|"Forward Transform<br/>2×5 + 3 = 13"| US
US -->|"Inverse Transform<br/>13÷5=2, 13%5=3"| LS
DATA -.->|"Multi-dimensional<br/>view"| LS
DATA -.->|"Linear<br/>view"| US
style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
.. mermaid::
graph TB
subgraph "MergeTransform: Multi-D → Linear"
LS["Lower Coordinate Space<br/>2D: [4, 5]<br/>Coord: (2, 3)"]
US["Upper Coordinate Space<br/>1D Linear<br/>Index: 13"]
DATA["Same Tensor Data<br/>Layout: row-major<br/>Size: 20 elements"]
end
LS -->|"Forward Transform<br/>2×5 + 3 = 13"| US
US -->|"Inverse Transform<br/>13÷5=2, 13%5=3"| LS
DATA -.->|"Multi-dimensional<br/>view"| LS
DATA -.->|"Linear<br/>view"| US
style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
.. image:: diagrams/transforms_3.svg
:alt: Diagram
:align: center
.. image:: diagrams/transforms_3.svg
:alt: Diagram
:align: center
@@ -180,28 +204,36 @@ UnmergeTransform expands coordinates from a single dimension in the lower coordi
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
..
Original mermaid diagram (edit here, then run update_diagrams.py)
graph TB
subgraph "UnmergeTransform: Linear → Multi-D"
LS["Lower Coordinate Space<br/>1D Linear<br/>Index: 14"]
US["Upper Coordinate Space<br/>3D: [3, 4, 2]<br/>Coord: (1, 3, 0)"]
DATA["Same Tensor Data<br/>Layout: row-major<br/>Size: 24 elements"]
end
LS -->|"Forward Transform<br/>14 = 1×8 + 3×2 + 0"| US
US -->|"Inverse Transform<br/>linearize back"| LS
DATA -.->|"Linear<br/>view"| LS
DATA -.->|"Multi-dimensional<br/>view"| US
style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
.. mermaid::
graph TB
subgraph "UnmergeTransform: Linear → Multi-D"
LS["Lower Coordinate Space<br/>1D Linear<br/>Index: 14"]
US["Upper Coordinate Space<br/>3D: [3, 4, 2]<br/>Coord: (1, 3, 0)"]
DATA["Same Tensor Data<br/>Layout: row-major<br/>Size: 24 elements"]
end
LS -->|"Forward Transform<br/>14 = 1×8 + 3×2 + 0"| US
US -->|"Inverse Transform<br/>linearize back"| LS
DATA -.->|"Linear<br/>view"| LS
DATA -.->|"Multi-dimensional<br/>view"| US
style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
.. image:: diagrams/transforms_4.svg
:alt: Diagram
:align: center
.. image:: diagrams/transforms_4.svg
:alt: Diagram
:align: center
@@ -253,28 +285,36 @@ EmbedTransform expands linear indices from the lower coordinate space into multi
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
..
Original mermaid diagram (edit here, then run update_diagrams.py)
graph TB
subgraph "EmbedTransform: Linear → Multi-D Strided"
LS["Lower Coordinate Space<br/>1D Linear<br/>Index: 14"]
US["Upper Coordinate Space<br/>2D: [2, 3]<br/>Coord: (1, 2)"]
DATA["Linear Buffer in Memory"]
end
LS -->|"Forward Transform <br/>Strides: [12, 1] <br/>14 ÷ 12 = 1, 14 % 12 = 2"| US
US -->|"Inverse Transform<br/>1×12 + 2×1 = 14"| LS
DATA -.->|"Linear<br/>index view"| LS
DATA -.->|"Multi-dimensional<br/>strided view"| US
style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
.. mermaid::
graph TB
subgraph "EmbedTransform: Linear → Multi-D Strided"
LS["Lower Coordinate Space<br/>1D Linear<br/>Index: 14"]
US["Upper Coordinate Space<br/>2D: [2, 3]<br/>Coord: (1, 2)"]
DATA["Linear Buffer in Memory"]
end
LS -->|"Forward Transform <br/>Strides: [12, 1] <br/>14 ÷ 12 = 1, 14 % 12 = 2"| US
US -->|"Inverse Transform<br/>1×12 + 2×1 = 14"| LS
DATA -.->|"Linear<br/>index view"| LS
DATA -.->|"Multi-dimensional<br/>strided view"| US
style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
.. image:: diagrams/transforms_5.svg
:alt: Diagram
:align: center
.. image:: diagrams/transforms_5.svg
:alt: Diagram
:align: center
@@ -315,28 +355,36 @@ ReplicateTransform creates a higher-dimensional tensor by replicating (broadcast
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
..
Original mermaid diagram (edit here, then run update_diagrams.py)
graph TB
subgraph "ReplicateTransform: 0D → Multi-D Broadcasting"
LS["Lower Coordinate Space<br/>0D: Scalar<br/>Empty coordinate []"]
US["Upper Coordinate Space<br/>2D: [3, 4]<br/>All coords: (i, j)"]
DATA["Single Scalar Value"]
end
LS -->|"Forward Transform<br/>[] → (i,j) for any i,j"| US
US -->|"Inverse Transform<br/>(i,j) → [] for any i,j"| LS
DATA -.->|"One scalar<br/>value"| LS
DATA -.->|"Broadcasted view<br/>at all positions"| US
style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
.. mermaid::
graph TB
subgraph "ReplicateTransform: 0D → Multi-D Broadcasting"
LS["Lower Coordinate Space<br/>0D: Scalar<br/>Empty coordinate []"]
US["Upper Coordinate Space<br/>2D: [3, 4]<br/>All coords: (i, j)"]
DATA["Single Scalar Value"]
end
LS -->|"Forward Transform<br/>[] → (i,j) for any i,j"| US
US -->|"Inverse Transform<br/>(i,j) → [] for any i,j"| LS
DATA -.->|"One scalar<br/>value"| LS
DATA -.->|"Broadcasted view<br/>at all positions"| US
style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
.. image:: diagrams/transforms_6.svg
:alt: Diagram
:align: center
.. image:: diagrams/transforms_6.svg
:alt: Diagram
:align: center
@@ -388,28 +436,36 @@ OffsetTransform shifts coordinates by a fixed offset, creating a translated view
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
..
Original mermaid diagram (edit here, then run update_diagrams.py)
graph TB
subgraph "OffsetTransform: 1D → 1D Translation"
LS["Lower Coordinate Space<br/>1D: [0, 63]<br/>Coord: index + offset"]
US["Upper Coordinate Space<br/>1D: [0, 47]<br/>Coord: index"]
DATA["Linear Buffer in Memory"]
end
LS -->|"Forward Transform<br/>idx → idx + 16"| US
US -->|"Inverse Transform<br/>idx + 16 → idx"| LS
DATA -.->|"Lower<br/>view"| LS
DATA -.->|"Upper<br/>view"| US
style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
.. mermaid::
graph TB
subgraph "OffsetTransform: 1D → 1D Translation"
LS["Lower Coordinate Space<br/>1D: [0, 63]<br/>Coord: index + offset"]
US["Upper Coordinate Space<br/>1D: [0, 47]<br/>Coord: index"]
DATA["Linear Buffer in Memory"]
end
LS -->|"Forward Transform<br/>idx → idx + 16"| US
US -->|"Inverse Transform<br/>idx + 16 → idx"| LS
DATA -.->|"Lower<br/>view"| LS
DATA -.->|"Upper<br/>view"| US
style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
.. image:: diagrams/transforms_7.svg
:alt: Diagram
:align: center
.. image:: diagrams/transforms_7.svg
:alt: Diagram
:align: center
@@ -462,28 +518,36 @@ No-op transform that passes coordinates unchanged. The PassThrough transform is
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
..
Original mermaid diagram (edit here, then run update_diagrams.py)
graph TB
subgraph "PassThroughTransform: 1D → 1D Identity"
LS["Lower Coordinate Space<br/>1D: [0, 59]<br/>Coord: index"]
US["Upper Coordinate Space<br/>1D: [0, 59]<br/>Coord: index"]
DATA["Linear Buffer in Memory"]
end
LS -.->|"Perfect Identity<br/>idx → idx"| US
US -.->|"Perfect Identity<br/>idx → idx"| LS
DATA -->|"Same buffer<br/>same view"| LS
DATA -->|"Same buffer<br/>same view"| US
style LS fill:#e8f5e8,stroke:#2e7d32,stroke-width:3px
style US fill:#e8f5e8,stroke:#2e7d32,stroke-width:3px
style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
.. mermaid::
graph TB
subgraph "PassThroughTransform: 1D → 1D Identity"
LS["Lower Coordinate Space<br/>1D: [0, 59]<br/>Coord: index"]
US["Upper Coordinate Space<br/>1D: [0, 59]<br/>Coord: index"]
DATA["Linear Buffer in Memory"]
end
LS -.->|"Perfect Identity<br/>idx → idx"| US
US -.->|"Perfect Identity<br/>idx → idx"| LS
DATA -->|"Same buffer<br/>same view"| LS
DATA -->|"Same buffer<br/>same view"| US
style LS fill:#e8f5e8,stroke:#2e7d32,stroke-width:3px
style US fill:#e8f5e8,stroke:#2e7d32,stroke-width:3px
style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
.. image:: diagrams/transforms_8.svg
:alt: Diagram
:align: center
.. image:: diagrams/transforms_8.svg
:alt: Diagram
:align: center
@@ -531,28 +595,36 @@ PadTransform adds padding to tensor dimensions, mapping coordinates from upper d
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
..
Original mermaid diagram (edit here, then run update_diagrams.py)
graph TB
subgraph "PadTransform: 1D → 1D with Padding"
LS["Lower Coordinate Space<br/>1D: [0, 2] (original data)"]
US["Upper Coordinate Space<br/>1D: [0, 4] (with padding)"]
DATA["Tensor Data in Memory"]
end
LS -->|"Forward Transform<br/>idx + left_pad"| US
US -->|"Inverse Transform<br/>idx - left_pad"| LS
DATA -.->|"Original view"| LS
DATA -.->|"Padded view"| US
style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
.. mermaid::
graph TB
subgraph "PadTransform: 1D → 1D with Padding"
LS["Lower Coordinate Space<br/>1D: [0, 2] (original data)"]
US["Upper Coordinate Space<br/>1D: [0, 4] (with padding)"]
DATA["Tensor Data in Memory"]
end
LS -->|"Forward Transform<br/>idx + left_pad"| US
US -->|"Inverse Transform<br/>idx - left_pad"| LS
DATA -.->|"Original view"| LS
DATA -.->|"Padded view"| US
style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
.. image:: diagrams/transforms_9.svg
:alt: Diagram
:align: center
.. image:: diagrams/transforms_9.svg
:alt: Diagram
:align: center
@@ -603,28 +675,36 @@ XorTransform applies a 2D XOR mapping for specialized memory access patterns. It
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
..
Original mermaid diagram (edit here, then run update_diagrams.py)
graph TB
subgraph "XorTransform: 2D → 2D XOR Mapping"
LS["Lower Coordinate Space<br/>2D: [4, 8]<br/>XOR-transformed coords"]
US["Upper Coordinate Space<br/>2D: [4, 8]<br/>Normal coords"]
DATA["Same Tensor Data"]
end
LS -->|"Forward Transform<br/>apply XOR reverse"| US
US -->|"Inverse Transform<br/>apply XOR mapping"| LS
DATA -.->|"XOR pattern<br/>view"| LS
DATA -.->|"Normal<br/>view"| US
style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
.. mermaid::
graph TB
subgraph "XorTransform: 2D → 2D XOR Mapping"
LS["Lower Coordinate Space<br/>2D: [4, 8]<br/>XOR-transformed coords"]
US["Upper Coordinate Space<br/>2D: [4, 8]<br/>Normal coords"]
DATA["Same Tensor Data"]
end
LS -->|"Forward Transform<br/>apply XOR reverse"| US
US -->|"Inverse Transform<br/>apply XOR mapping"| LS
DATA -.->|"XOR pattern<br/>view"| LS
DATA -.->|"Normal<br/>view"| US
style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
.. image:: diagrams/transforms_10.svg
:alt: Diagram
:align: center
.. image:: diagrams/transforms_10.svg
:alt: Diagram
:align: center
@@ -636,28 +716,36 @@ SliceTransform extracts a sub-region from a tensor dimension.
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
..
Original mermaid diagram (edit here, then run update_diagrams.py)
graph TB
subgraph "SliceTransform: 1D → 1D Sub-region"
LS["Lower Coordinate Space<br/>1D: [0, 9] (original range)"]
US["Upper Coordinate Space<br/>1D: [0, 4] (slice range)"]
DATA["Tensor Data in Memory"]
end
LS -->|"Forward Transform<br/>idx + slice_begin"| US
US -->|"Inverse Transform<br/>idx - slice_begin"| LS
DATA -.->|"Full tensor<br/>view"| LS
DATA -.->|"Sub-region<br/>view"| US
style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
.. mermaid::
graph TB
subgraph "SliceTransform: 1D → 1D Sub-region"
LS["Lower Coordinate Space<br/>1D: [0, 9] (original range)"]
US["Upper Coordinate Space<br/>1D: [0, 4] (slice range)"]
DATA["Tensor Data in Memory"]
end
LS -->|"Forward Transform<br/>idx + slice_begin"| US
US -->|"Inverse Transform<br/>idx - slice_begin"| LS
DATA -.->|"Full tensor<br/>view"| LS
DATA -.->|"Sub-region<br/>view"| US
style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
.. image:: diagrams/transforms_11.svg
:alt: Diagram
:align: center
.. image:: diagrams/transforms_11.svg
:alt: Diagram
:align: center
@@ -669,28 +757,36 @@ ModuloTransform applies cyclic wrapping to coordinates using modulo operations.
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
..
Original mermaid diagram (edit here, then run update_diagrams.py)
graph TB
subgraph "ModuloTransform: 1D → 1D Cyclic"
LS["Lower Coordinate Space<br/>1D: [0, 3] (modulus range)"]
US["Upper Coordinate Space<br/>1D: [0, 15] (full range)"]
DATA["Tensor Data in Memory"]
end
LS -->|"Forward Transform<br/>idx * cycle_count"| US
US -->|"Inverse Transform<br/>idx % modulus"| LS
DATA -.->|" "| LS
DATA -.->|" "| US
style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
.. mermaid::
graph TB
subgraph "ModuloTransform: 1D → 1D Cyclic"
LS["Lower Coordinate Space<br/>1D: [0, 3] (modulus range)"]
US["Upper Coordinate Space<br/>1D: [0, 15] (full range)"]
DATA["Tensor Data in Memory"]
end
LS -->|"Forward Transform<br/>idx * cycle_count"| US
US -->|"Inverse Transform<br/>idx % modulus"| LS
DATA -.->|" "| LS
DATA -.->|" "| US
style LS fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
style US fill:#fff3e0,stroke:#f57c00,stroke-width:3px
style DATA fill:#f0f9ff,stroke:#0284c7,stroke-width:2px,stroke-dasharray: 5 5
.. image:: diagrams/transforms_12.svg
:alt: Diagram
:align: center
.. image:: diagrams/transforms_12.svg
:alt: Diagram
:align: center