v4.4.2 update. (#3104)

2026-05-11 17:00:05 +00:00 · 2026-03-17 12:58:19 +08:00
parent 772fbb264e
commit 1b741cabaa
31 changed files with 996 additions and 355 deletions
--- a/examples/python/CuTeDSL/blackwell/dense_gemm_persistent.py
+++ b/examples/python/CuTeDSL/blackwell/dense_gemm_persistent.py
@@ -1655,9 +1655,11 @@ def run(
            .to(dtype=torch_dtype(c_dtype))
            .to(dtype=torch.float32)
        )
-        # Read back the result from CuTe tensor (c_storage was updated in-place)
        torch.testing.assert_close(
-            c_storage.to(dtype=torch.float32), ref, atol=tolerance, rtol=1e-03
+            c_storage.view(torch_dtype(c_dtype)).to(dtype=torch.float32),
+            ref,
+            atol=tolerance,
+            rtol=1e-03,
        )

    if not benchmark:
--- a/examples/python/CuTeDSL/blackwell/dense_gemm_persistent_dynamic.py
+++ b/examples/python/CuTeDSL/blackwell/dense_gemm_persistent_dynamic.py
@@ -1725,9 +1725,11 @@ def run(
            .to(dtype=torch_dtype(c_dtype))
            .to(dtype=torch.float32)
        )
-        # Read back the result from CuTe tensor (c_storage was updated in-place)
        torch.testing.assert_close(
-            c_storage.to(dtype=torch.float32), ref, atol=tolerance, rtol=1e-03
+            c_storage.view(torch_dtype(c_dtype)).to(dtype=torch.float32),
+            ref,
+            atol=tolerance,
+            rtol=1e-03,
        )

    if not benchmark:
--- a/examples/python/CuTeDSL/blackwell/fmha.py
+++ b/examples/python/CuTeDSL/blackwell/fmha.py
@@ -2546,8 +2546,6 @@ def run(
        slices = tuple(slice(s, e) for s, e in zip(padding, shape_))
        torch_tensor = torch_tensor_full[slices].detach()
        f32_torch_tensor = f32_torch_tensor_full[slices].detach()
-        torch_tensor._keep_alive = torch_tensor_full
-        f32_torch_tensor._keep_alive = f32_torch_tensor_full

        # Create dtype cute tensor with offset (gpu)
        cute_tensor = from_dlpack(torch_tensor, assumed_align=16)
--- a/examples/python/CuTeDSL/hopper/fmha.py
+++ b/examples/python/CuTeDSL/hopper/fmha.py
@@ -104,6 +104,67 @@ if __name__ == "__main__":

 from helpers import fmha_helpers as fmha_utils

+from cutlass.cutlass_dsl import (
+    Boolean, Int32, if_generate, while_generate, yield_out, not_, dsl_user_op,
+)
+from cutlass._mlir.dialects import nvvm
+from cutlass._mlir._mlir_libs._cutlass_ir._mlir.ir import IntegerType
+from contextlib import contextmanager
+
+
+import inspect as _inspect
+
+_timelimit_has_res = "res" in _inspect.signature(  # probed once at import: does this nvvm binding accept a parameter named `res`?
+    nvvm.mbarrier_try_wait_parity_timelimit
+).parameters
+
+
+def _try_wait_timelimit(llvm_ptr, phase_val, timeout, *, loc=None, ip=None):  # Call nvvm.mbarrier_try_wait_parity_timelimit using whichever signature the installed binding exposes.
+    if _timelimit_has_res:  # binding has a `res` parameter: pass a type as the first positional argument
+        i1 = IntegerType.get_signless(1)  # signless 1-bit integer type; presumably the op's result type — TODO confirm
+        return nvvm.mbarrier_try_wait_parity_timelimit(
+            i1, llvm_ptr, phase_val, timeout, loc=loc, ip=ip,
+        )
+    return nvvm.mbarrier_try_wait_parity_timelimit(  # binding without `res`: same call minus the leading type argument
+        llvm_ptr, phase_val, timeout, loc=loc, ip=ip,
+    )
+
+
+@dsl_user_op
+def _optimized_mbarrier_wait(mbar_ptr, phase, *, loc=None, ip=None):  # Stand-in for arch.mbarrier_wait built from timed try-wait calls: three unrolled attempts, then a retry loop.
+    llvm_ptr = mbar_ptr.llvm_ptr
+    phase_val = Int32(phase).ir_value(loc=loc, ip=ip)
+    _true = lambda: Boolean(True).ir_value(loc=loc, ip=ip)  # branch body that only produces a constant True
+    timeout = Int32(10000000).ir_value(loc=loc, ip=ip)  # time limit passed to every try-wait; units are not shown here — TODO confirm
+    d = Boolean(_try_wait_timelimit(llvm_ptr, phase_val, timeout, loc=loc, ip=ip))  # attempt 1
+    d = if_generate(d, _true,  # attempt 2; presumably if_generate(cond, then_body, else_body, ...) so the retry sits on the d == False side — verify
+        lambda: _try_wait_timelimit(llvm_ptr, phase_val, timeout, loc=loc, ip=ip),
+        None, [Boolean], loc=loc, ip=ip)
+    d = if_generate(d, _true,  # attempt 3, same shape as attempt 2
+        lambda: _try_wait_timelimit(llvm_ptr, phase_val, timeout, loc=loc, ip=ip),
+        None, [Boolean], loc=loc, ip=ip)
+    def _fallback():  # last resort: keep issuing try-waits inside a generated while loop
+        inner = Boolean(False).ir_value(loc=loc, ip=ip)  # initial loop-carried value
+        ctx = while_generate([inner], lambda x: not_(x, loc=loc, ip=ip), loc=loc, ip=ip)  # loop condition is not_(carried value); presumably iterates while the last try-wait was False
+        with ctx as (_,):
+            r = Boolean(_try_wait_timelimit(
+                llvm_ptr, phase_val, timeout, loc=loc, ip=ip,
+            ))
+            yield_out([r], loc=loc, ip=ip)  # hand this attempt's result back as the next carried value
+        return Boolean(True).ir_value(loc=loc, ip=ip)  # the branch yields a constant True after the loop
+    if_generate(d, _true, _fallback, None, [Boolean], loc=loc, ip=ip)  # result is discarded; this function returns None
+
+
+@contextmanager
+def _use_optimized_mbarrier_wait():  # Context manager: temporarily replace cutlass.cute.arch.mbarrier_wait with _optimized_mbarrier_wait.
+    import cutlass.cute.arch as arch_mod
+    orig_wait = arch_mod.mbarrier_wait  # keep the current implementation so it can be put back
+    arch_mod.mbarrier_wait = _optimized_mbarrier_wait  # patches the module attribute; names bound earlier via `from ... import mbarrier_wait` are not affected
+    try:
+        yield
+    finally:
+        arch_mod.mbarrier_wait = orig_wait  # restore even when the with-body raises
+

 class HopperFusedMultiHeadAttentionForward:
    def __init__(
@@ -439,36 +500,37 @@ class HopperFusedMultiHeadAttentionForward:
        self.shared_storage = SharedStorage

        # Launch the kernel synchronously
-        self.kernel(
-            qk_tiled_mma,
-            pv_tiled_mma,
-            tma_atom_q,
-            tma_tensor_q,
-            tma_atom_k,
-            tma_tensor_k,
-            tma_atom_v,
-            tma_tensor_v,
-            tma_atom_o,
-            tma_tensor_o,
-            lse,
-            scale_softmax_log2,
-            scale_softmax,
-            scale_output,
-            window_size_left,
-            window_size_right,
-            q_smem_layout_staged,
-            k_smem_layout_staged,
-            v_smem_layout_staged,
-            o_smem_layout_staged,
-            self.tile_sched_params,
-        ).launch(
-            grid=grid,
-            block=[self.threads_per_cta, 1, 1],
-            cluster=self.cluster_shape_mnk,
-            smem=self.shared_storage.size_in_bytes(),
-            stream=stream,
-            min_blocks_per_mp=1,
-        )
+        with _use_optimized_mbarrier_wait():
+            self.kernel(
+                qk_tiled_mma,
+                pv_tiled_mma,
+                tma_atom_q,
+                tma_tensor_q,
+                tma_atom_k,
+                tma_tensor_k,
+                tma_atom_v,
+                tma_tensor_v,
+                tma_atom_o,
+                tma_tensor_o,
+                lse,
+                scale_softmax_log2,
+                scale_softmax,
+                scale_output,
+                window_size_left,
+                window_size_right,
+                q_smem_layout_staged,
+                k_smem_layout_staged,
+                v_smem_layout_staged,
+                o_smem_layout_staged,
+                self.tile_sched_params,
+            ).launch(
+                grid=grid,
+                block=[self.threads_per_cta, 1, 1],
+                cluster=self.cluster_shape_mnk,
+                smem=self.shared_storage.size_in_bytes(),
+                stream=stream,
+                min_blocks_per_mp=1,
+            )

    #  GPU device kernel
    @cute.kernel
--- a/examples/python/CuTeDSL/notebooks/elementwise_add.ipynb
+++ b/examples/python/CuTeDSL/notebooks/elementwise_add.ipynb
@@ -97,7 +97,7 @@
    "    # Step 2: Map thread index to tensor coordinates\n",
    "    # -------------------------------------------\n",
    "    # Each thread will process one element of the input tensors\n",
-    "    m, n = gA.shape  # Get tensor dimensions (M rows × N columns)\n",
+    "    m, n = gA.shape  # Get tensor dimensions (M rows \u00d7 N columns)\n",
    "\n",
    "    # Convert linear thread index to 2D coordinates:\n",
    "    # - ni: column index (0 to n-1)\n",
@@ -198,7 +198,7 @@
    "    num_threads_per_block = 256\n",
    "\n",
    "    # Get input dimensions\n",
-    "    m, n = mA.shape  # Matrix dimensions (M rows × N columns)\n",
+    "    m, n = mA.shape  # Matrix dimensions (M rows \u00d7 N columns)\n",
    "\n",
    "    # Create kernel instance\n",
    "    kernel = naive_elementwise_add_kernel(mA, mB, mC)\n",
@@ -298,7 +298,7 @@
    "   - For elementwise add:\n",
    "     * Read: 2 elements (A and B)\n",
    "     * Write: 1 element (C)\n",
-    "     * Total bytes = (2 reads + 1 write) × elements × sizeof(dtype)\n",
+    "     * Total bytes = (2 reads + 1 write) \u00d7 elements \u00d7 sizeof(dtype)\n",
    "\n",
    "Below is our benchmarking utility that measures these metrics:"
   ]
@@ -368,7 +368,7 @@
    "\n",
    "According to *Little's Law*, naive implementation has\n",
    "   - 1 element (4 bytes load + 2 bytes store) per thread\n",
-    "   - 256 threads/block × N blocks\n",
+    "   - 256 threads/block \u00d7 N blocks\n",
    "   - Limited in-flight operations\n",
    "\n",
    "In some GPUs, it's insufficient parallelism to saturate memory bandwidth.\n",
@@ -385,7 +385,35 @@
  {
   "cell_type": "markdown",
   "metadata": {},
-   "source": "## Vectorized Load and Store\n\nTo improve performance according to Little's Law, we need to increase the number\nof in-flight requests. We can do this by increasing the number of bytes handled\nin each load & store operation per thread through vectorized memory access.\n\nSince Ampere GPUs support up to 128-bit per load/store and each element is 16-bit,\nwe can load 8 elements per vectorized operation on contiguous rows.\nCuTe tiling operations make this vectorization straightforward.\n\nUsing ``tiled_tensor = cute.zipped_divide(tensor, tiler)``, we can partition the input\n``tensor`` into groups of ``tiler`` blocks. For vectorization, we specify ``tiler``\nas the block of data each thread accesses (8 contiguous elements in the same row, or ``(1,8)``).\nDifferent threads can then access different blocks by indexing into the 2nd mode of ``tiled_tensor``.\n\n```python\nmA : cute.Tensor                           # (2048,2048):(2048,1)\ngA = cute.zipped_divide(a, tiler=(1, 8))   # tiled/vectorized => ((1,8),(2048,256)):((0,1),(2048,8))\n```\n\n$\n    \\begin{array}{ccccc}\n    & ((1,8) & , & (2048,256)) & : ((0,1),(2048,8)) \\\\\n    & \\underbrace{\\phantom{(1,8)}}_{tiler} & & \\underbrace{\\phantom{(2048,256)}}_{threads} & \\\\\n    & \\text{\\scriptsize per-thread} & & \\text{\\scriptsize num of tiles}\n    \\end{array}\n$"
+   "source": [
+    "## Vectorized Load and Store\n",
+    "\n",
+    "To improve performance according to Little's Law, we need to increase the number\n",
+    "of in-flight requests. We can do this by increasing the number of bytes handled\n",
+    "in each load & store operation per thread through vectorized memory access.\n",
+    "\n",
+    "Since Ampere GPUs support up to 128-bit per load/store and each element is 16-bit,\n",
+    "we can load 8 elements per vectorized operation on contiguous rows.\n",
+    "CuTe tiling operations make this vectorization straightforward.\n",
+    "\n",
+    "Using ``tiled_tensor = cute.zipped_divide(tensor, tiler)``, we can partition the input\n",
+    "``tensor`` into groups of ``tiler`` blocks. For vectorization, we specify ``tiler``\n",
+    "as the block of data each thread accesses (8 contiguous elements in the same row, or ``(1,8)``).\n",
+    "Different threads can then access different blocks by indexing into the 2nd mode of ``tiled_tensor``.\n",
+    "\n",
+    "```python\n",
+    "mA : cute.Tensor                           # (2048,2048):(2048,1)\n",
+    "gA = cute.zipped_divide(mA, tiler=(1, 8))  # tiled/vectorized => ((1,8),(2048,256)):((0,1),(2048,8))\n",
+    "```\n",
+    "\n",
+    "$\n",
+    "    \\begin{array}{ccccc}\n",
+    "    & ((1,8) & , & (2048,256)) & : ((0,1),(2048,8)) \\\\\n",
+    "    & \\underbrace{\\phantom{(1,8)}}_{tiler} & & \\underbrace{\\phantom{(2048,256)}}_{threads} & \\\\\n",
+    "    & \\text{\\scriptsize per-thread} & & \\text{\\scriptsize num of tiles}\n",
+    "    \\end{array}\n",
+    "$"
+   ]
  },
  {
   "cell_type": "code",
@@ -423,14 +451,91 @@
  {
   "cell_type": "markdown",
   "metadata": {},
-   "source": "This vectorized kernel follows a similar structure to its naive non-vectorized counterpart,\nwith one key difference: the tensor slicing pattern. By using `(None, (mi, ni))` as the slice indices,\nwe can extract a `(1,8)` sub-tensor from `gA`, `gB` and `gC` like \n\n$ gA[(None, (mi, ni))]: $\n\n$\n  \\begin{array}{ccccc}\n    Layout: & ( & (1,8)                        & , & (2048,256) & )                    & : & ((0,1),(2048,8)) & \\xrightarrow{\\text{slice}} & ((1,8)):((0,1)) \\\\\n            &   & \\underbrace{\\phantom{(1,8)}} &   & \\underbrace{\\phantom{(2048,256)}} &   & \\\\\n    Coord:  & ( & None                         & , & (mi, ni)   & )                    &   &\n  \\end{array}\n$\n\nThen tensor data can be loaded into vector via the `gA[(None, (mi, ni))].load()` method. It is equivalent to\n\n```python\nv0 = gA[(0, (mi, ni))]   # => mA[(mi, ni * 8 + 0)]\nv1 = gA[(1, (mi, ni))]   # => mA[(mi, ni * 8 + 1)]\nv2 = gA[(2, (mi, ni))]   # => mA[(mi, ni * 8 + 2)]\nv3 = gA[(3, (mi, ni))]   # => mA[(mi, ni * 8 + 3)]\nv4 = gA[(4, (mi, ni))]   # => mA[(mi, ni * 8 + 4)]\nv5 = gA[(5, (mi, ni))]   # => mA[(mi, ni * 8 + 5)]\nv6 = gA[(6, (mi, ni))]   # => mA[(mi, ni * 8 + 6)]\nv7 = gA[(7, (mi, ni))]   # => mA[(mi, ni * 8 + 7)]\n```\n\n### Assumed Alignment\n\nIn order to guide compile to use vectorized load/store, we must tell compiler to assume alignment of incoming pointer. \nIt's on users side to guarantee actual pointer at runtime meet the alignment restriction.\n\n```python\na_ = from_dlpack(a, assumed_align=16)\nb_ = from_dlpack(b, assumed_align=16)\nc_ = from_dlpack(c, assumed_align=16)\n\n# Compile kernel with alignment assumption\ncompiled_func = cute.compile(vectorized_elementwise_add, a_, b_, c_)\n```\n\nIt's worth to note that partitioned or tiled tensor could have different alignment of its base pointer because of offset\nduring sub-slice."
+   "source": [
+    "This vectorized kernel follows a similar structure to its naive non-vectorized counterpart,\n",
+    "with one key difference: the tensor slicing pattern. By using `(None, (mi, ni))` as the slice indices,\n",
+    "we can extract a `(1,8)` sub-tensor from `gA`, `gB` and `gC` like \n",
+    "\n",
+    "$ gA[(None, (mi, ni))]: $\n",
+    "\n",
+    "$\n",
+    "  \\begin{array}{ccccc}\n",
+    "    Layout: & ( & (1,8)                        & , & (2048,256) & )                    & : & ((0,1),(2048,8)) & \\xrightarrow{\\text{slice}} & ((1,8)):((0,1)) \\\\\n",
+    "            &   & \\underbrace{\\phantom{(1,8)}} &   & \\underbrace{\\phantom{(2048,256)}} &   & \\\\\n",
+    "    Coord:  & ( & None                         & , & (mi, ni)   & )                    &   &\n",
+    "  \\end{array}\n",
+    "$\n",
+    "\n",
+    "Then tensor data can be loaded into vector via the `gA[(None, (mi, ni))].load()` method. It is equivalent to\n",
+    "\n",
+    "```python\n",
+    "v0 = gA[(0, (mi, ni))]   # => mA[(mi, ni * 8 + 0)]\n",
+    "v1 = gA[(1, (mi, ni))]   # => mA[(mi, ni * 8 + 1)]\n",
+    "v2 = gA[(2, (mi, ni))]   # => mA[(mi, ni * 8 + 2)]\n",
+    "v3 = gA[(3, (mi, ni))]   # => mA[(mi, ni * 8 + 3)]\n",
+    "v4 = gA[(4, (mi, ni))]   # => mA[(mi, ni * 8 + 4)]\n",
+    "v5 = gA[(5, (mi, ni))]   # => mA[(mi, ni * 8 + 5)]\n",
+    "v6 = gA[(6, (mi, ni))]   # => mA[(mi, ni * 8 + 6)]\n",
+    "v7 = gA[(7, (mi, ni))]   # => mA[(mi, ni * 8 + 7)]\n",
+    "```\n",
+    "\n",
+    "### Assumed Alignment\n",
+    "\n",
+    "To guide the compiler to use vectorized load/store, we must tell it to assume the alignment of the incoming pointer. \n",
+    "It is the user's responsibility to guarantee that the actual pointer at runtime meets the alignment restriction.\n",
+    "\n",
+    "```python\n",
+    "a_ = from_dlpack(a, assumed_align=16)\n",
+    "b_ = from_dlpack(b, assumed_align=16)\n",
+    "c_ = from_dlpack(c, assumed_align=16)\n",
+    "\n",
+    "# Compile kernel with alignment assumption\n",
+    "compiled_func = cute.compile(vectorized_elementwise_add, a_, b_, c_)\n",
+    "```\n",
+    "\n",
+    "It's worth noting that a partitioned or tiled tensor can have a different base-pointer alignment because of the offset\n",
+    "introduced by sub-slicing."
+   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
-   "source": "@cute.jit\ndef vectorized_elementwise_add(mA: cute.Tensor, mB: cute.Tensor, mC: cute.Tensor):\n    threads_per_block = 256\n\n    gA = cute.zipped_divide(mA, (1, 8))\n    gB = cute.zipped_divide(mB, (1, 8))\n    gC = cute.zipped_divide(mC, (1, 8))\n\n    print(\"[DSL INFO] Tiled Tensors:\")\n    print(f\"[DSL INFO]   gA = {gA}\")\n    print(f\"[DSL INFO]   gB = {gB}\")\n    print(f\"[DSL INFO]   gC = {gC}\")\n\n    vectorized_elementwise_add_kernel(gA, gB, gC).launch(\n        grid=(cute.size(gC, mode=[1]) // threads_per_block, 1, 1),\n        block=(threads_per_block, 1, 1),\n    )\n\n\na = torch.randn(M, N, device=\"cuda\", dtype=torch.float16)\nb = torch.randn(M, N, device=\"cuda\", dtype=torch.float16)\nc = torch.zeros(M, N, device=\"cuda\", dtype=torch.float16)\n\na_ = from_dlpack(a, assumed_align=16)\nb_ = from_dlpack(b, assumed_align=16)\nc_ = from_dlpack(c, assumed_align=16)\n\ncompiled_func = cute.compile(vectorized_elementwise_add, a_, b_, c_)\ncompiled_func(a_, b_, c_)\n\n# verify correctness\ntorch.testing.assert_close(c, a + b)"
+   "source": [
+    "@cute.jit\n",
+    "def vectorized_elementwise_add(mA: cute.Tensor, mB: cute.Tensor, mC: cute.Tensor):\n",
+    "    threads_per_block = 256\n",
+    "\n",
+    "    gA = cute.zipped_divide(mA, (1, 8))\n",
+    "    gB = cute.zipped_divide(mB, (1, 8))\n",
+    "    gC = cute.zipped_divide(mC, (1, 8))\n",
+    "\n",
+    "    print(\"[DSL INFO] Tiled Tensors:\")\n",
+    "    print(f\"[DSL INFO]   gA = {gA}\")\n",
+    "    print(f\"[DSL INFO]   gB = {gB}\")\n",
+    "    print(f\"[DSL INFO]   gC = {gC}\")\n",
+    "\n",
+    "    vectorized_elementwise_add_kernel(gA, gB, gC).launch(\n",
+    "        grid=(cute.size(gC, mode=[1]) // threads_per_block, 1, 1),\n",
+    "        block=(threads_per_block, 1, 1),\n",
+    "    )\n",
+    "\n",
+    "\n",
+    "a = torch.randn(M, N, device=\"cuda\", dtype=torch.float16)\n",
+    "b = torch.randn(M, N, device=\"cuda\", dtype=torch.float16)\n",
+    "c = torch.zeros(M, N, device=\"cuda\", dtype=torch.float16)\n",
+    "\n",
+    "a_ = from_dlpack(a, assumed_align=16)\n",
+    "b_ = from_dlpack(b, assumed_align=16)\n",
+    "c_ = from_dlpack(c, assumed_align=16)\n",
+    "\n",
+    "compiled_func = cute.compile(vectorized_elementwise_add, a_, b_, c_)\n",
+    "compiled_func(a_, b_, c_)\n",
+    "\n",
+    "# verify correctness\n",
+    "torch.testing.assert_close(c, a + b)"
+   ]
  },
  {
   "cell_type": "code",
@@ -444,7 +549,68 @@
  {
   "cell_type": "markdown",
   "metadata": {},
-   "source": "## TV Layout\n\nBoth the naive and vectorized kernels follow a common pattern to map thread indices\nto physical addresses in two steps:\n\nStep 1: Map thread index to logical coordinates in `(M, N)`\n\n* `mi = thread_idx // n`\n* `ni = thread_idx % n`\n\nIn native version, each thread process 1 element, in this case, `mi` and `ni` is logical\ncoordinate into data tensor `mA`, `mB` and `mC`.\n\nInt vectorized version, each thread process multiple values of input and output tensor.\nlogical coordinate should be computed with both thread and value index.\n\n* `thread_idx // n`\n* `(thread_idx % n) * 8 + value_idx`\n\n\nStep 2: Map logical coordinates in `(M, N)` to physical addresses using the tensor layout\n\n* Vectorized Load\n\n```python\n    frgA = gA[(None, (mi, ni))].load()\n```\n\n* Elementwise Load (less efficient)\n\n```python\n    frgA0 = mA[(mi, ni * 8 + 0)]\n    frgA1 = mA[(mi, ni * 8 + 1)]\n    frgA2 = mA[(mi, ni * 8 + 2)]\n    frgA3 = mA[(mi, ni * 8 + 3)]\n    frgA4 = mA[(mi, ni * 8 + 4)]\n    frgA5 = mA[(mi, ni * 8 + 5)]\n    frgA6 = mA[(mi, ni * 8 + 6)]\n    frgA7 = mA[(mi, ni * 8 + 7)]\n\n    # Or use divided layout\n\n    frgA0 = gA[(0, (mi, ni))]\n    frgA1 = gA[(1, (mi, ni))]\n    frgA2 = gA[(2, (mi, ni))]\n    frgA3 = gA[(3, (mi, ni))]\n    frgA4 = gA[(4, (mi, ni))]\n    frgA5 = gA[(5, (mi, ni))]\n    frgA6 = gA[(6, (mi, ni))]\n    frgA7 = gA[(7, (mi, ni))]\n```\n\nCuTe introduces TV layout to represent this mapping from thread index and value index\n(i.e., the 8 elements loaded per thread) to the logical coordinate space of a tensor.\nBy configuring different TV layouts, we can experiment with different memory access\npatterns with minimal code changes.\n\n**Definition:** *TV Layout* is rank-2 layout which maps `(thread_index, value_index)` \nto logical coordinate of tensor.  
\n\nWe always have *TV Layout* with canonical form as `(thread_domain, value_domain):(..., ...)`.\n\nWith *TV Layout*, each thread can find logical coordinates or indices of data partitioned\nto current thread."
+   "source": [
+    "## TV Layout\n",
+    "\n",
+    "Both the naive and vectorized kernels follow a common pattern to map thread indices\n",
+    "to physical addresses in two steps:\n",
+    "\n",
+    "Step 1: Map thread index to logical coordinates in `(M, N)`\n",
+    "\n",
+    "* `mi = thread_idx // n`\n",
+    "* `ni = thread_idx % n`\n",
+    "\n",
+    "In the naive version, each thread processes 1 element; in this case, `mi` and `ni` are the logical\n",
+    "coordinates into the data tensors `mA`, `mB` and `mC`.\n",
+    "\n",
+    "In the vectorized version, each thread processes multiple values of the input and output tensors.\n",
+    "The logical coordinate must be computed from both the thread index and the value index.\n",
+    "\n",
+    "* `thread_idx // n`\n",
+    "* `(thread_idx % n) * 8 + value_idx`\n",
+    "\n",
+    "\n",
+    "Step 2: Map logical coordinates in `(M, N)` to physical addresses using the tensor layout\n",
+    "\n",
+    "* Vectorized Load\n",
+    "\n",
+    "```python\n",
+    "    frgA = gA[(None, (mi, ni))].load()\n",
+    "```\n",
+    "\n",
+    "* Elementwise Load (less efficient)\n",
+    "\n",
+    "```python\n",
+    "    frgA0 = mA[(mi, ni * 8 + 0)]\n",
+    "    frgA1 = mA[(mi, ni * 8 + 1)]\n",
+    "    frgA2 = mA[(mi, ni * 8 + 2)]\n",
+    "    frgA3 = mA[(mi, ni * 8 + 3)]\n",
+    "    frgA4 = mA[(mi, ni * 8 + 4)]\n",
+    "    frgA5 = mA[(mi, ni * 8 + 5)]\n",
+    "    frgA6 = mA[(mi, ni * 8 + 6)]\n",
+    "    frgA7 = mA[(mi, ni * 8 + 7)]\n",
+    "\n",
+    "    # Or use divided layout\n",
+    "\n",
+    "    frgA0 = gA[(0, (mi, ni))]\n",
+    "    frgA1 = gA[(1, (mi, ni))]\n",
+    "    frgA2 = gA[(2, (mi, ni))]\n",
+    "    frgA3 = gA[(3, (mi, ni))]\n",
+    "```\n",
+    "\n",
+    "CuTe introduces TV layout to represent this mapping from thread index and value index\n",
+    "(i.e., the 8 elements loaded per thread) to the logical coordinate space of a tensor.\n",
+    "By configuring different TV layouts, we can experiment with different memory access\n",
+    "patterns with minimal code changes.\n",
+    "\n",
+    "**Definition:** *TV Layout* is rank-2 layout which maps `(thread_index, value_index)` \n",
+    "to logical coordinate of tensor.  \n",
+    "\n",
+    "We always have *TV Layout* with canonical form as `(thread_domain, value_domain):(..., ...)`.\n",
+    "\n",
+    "With *TV Layout*, each thread can find logical coordinates or indices of data partitioned\n",
+    "to current thread.\n"
+   ]
  },
  {
   "cell_type": "markdown",
@@ -1057,4 +1223,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 4
-}
+}