diff --git a/examples/python/CuTeDSL/blackwell/dense_blockscaled_gemm_persistent_prefetch.py b/examples/python/CuTeDSL/blackwell/dense_blockscaled_gemm_persistent_prefetch.py index 4b8f0f22b..f9d44e96d 100644 --- a/examples/python/CuTeDSL/blackwell/dense_blockscaled_gemm_persistent_prefetch.py +++ b/examples/python/CuTeDSL/blackwell/dense_blockscaled_gemm_persistent_prefetch.py @@ -647,7 +647,7 @@ class Sm100BlockScaledPersistentDenseGemmKernel: ab_empty_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_ab_stage] acc_full_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage] acc_empty_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage] - tmem_dealloc_mbar_ptr: cutlass.Int64 + tmem_dealloc_mbar: cutlass.Int64 tmem_holding_buf: cutlass.Int32 # (EPI_TILE_M, EPI_TILE_N, STAGE) sC: cute.struct.Align[ @@ -826,11 +826,11 @@ class Sm100BlockScaledPersistentDenseGemmKernel: # Tensor memory dealloc barrier init tmem = utils.TmemAllocator( - storage.tmem_holding_buf, + storage.tmem_holding_buf.ptr, barrier_for_retrieve=self.tmem_alloc_barrier, allocator_warp_id=self.epilog_warp_id[0], is_two_cta=use_2cta_instrs, - two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar_ptr, + two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar.ptr, ) # Cluster arrive after barrier init diff --git a/examples/python/CuTeDSL/blackwell/dense_gemm_persistent_prefetch.py b/examples/python/CuTeDSL/blackwell/dense_gemm_persistent_prefetch.py index 1892a842c..61accba03 100644 --- a/examples/python/CuTeDSL/blackwell/dense_gemm_persistent_prefetch.py +++ b/examples/python/CuTeDSL/blackwell/dense_gemm_persistent_prefetch.py @@ -648,7 +648,7 @@ class PersistentDenseGemmKernel: acc_full_mbar_ptr: cute.struct.MemRange[ cutlass.Int64, self.num_acc_stage * 2 ] - tmem_dealloc_mbar_ptr: cutlass.Int64 + tmem_dealloc_mbar: cutlass.Int64 tmem_holding_buf: cutlass.Int32 smem = utils.SmemAllocator() @@ -699,11 +699,11 @@ class PersistentDenseGemmKernel: ) # Tensor memory dealloc barrier init tmem = utils.TmemAllocator( - storage.tmem_holding_buf, + storage.tmem_holding_buf.ptr, barrier_for_retrieve=tmem_alloc_barrier, allocator_warp_id=self.epilog_warp_id[0], is_two_cta=use_2cta_instrs, - two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar_ptr, + two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar.ptr, ) # Cluster arrive after barrier init diff --git a/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_2.py b/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_2.py index d2e1ccff1..a1c1f9527 100644 --- a/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_2.py +++ b/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_2.py @@ -219,11 +219,11 @@ def kernel( * len((mma_warp_id, *epilogue_warp_ids)), # 5 warps = 160 threads ) tmem = utils.TmemAllocator( - storage.tmem_holding_buffer, + storage.tmem_holding_buffer.ptr, barrier_for_retrieve=tmem_alloc_barrier, allocator_warp_id=epilogue_warp_ids[0], is_two_cta=True if use_2cta_instrs else False, - two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar, + two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar.ptr, ) # Partition tensors for TMA; This requires the tensors partitioned for MMA diff --git a/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_3.py b/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_3.py index 9f831e961..96a816bce 100644 --- a/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_3.py +++ b/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_3.py @@ -152,11 +152,11 @@ def kernel( * len((mma_warp_id, *epilogue_warp_ids)), # 5 warps = 160 threads ) tmem = utils.TmemAllocator( - storage.tmem_holding_buffer, + storage.tmem_holding_buffer.ptr, barrier_for_retrieve=tmem_alloc_barrier, allocator_warp_id=epilogue_warp_ids[0], is_two_cta=True, - two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar, + two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar.ptr, ) num_tma_copy_bytes = ( diff --git a/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_3_1.py b/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_3_1.py index 553758cf7..961949e09 100644 --- a/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_3_1.py +++ b/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_3_1.py @@ -159,11 +159,11 @@ def kernel( * len((mma_warp_id, *epilogue_warp_ids)), # 5 warps = 160 threads ) tmem = utils.TmemAllocator( - storage.tmem_holding_buffer, + storage.tmem_holding_buffer.ptr, barrier_for_retrieve=tmem_alloc_barrier, allocator_warp_id=epilogue_warp_ids[0], is_two_cta=True, - two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar, + two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar.ptr, ) num_tma_copy_bytes = ( diff --git a/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_4.py b/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_4.py index e2d47397d..0576778cf 100644 --- a/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_4.py +++ b/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_4.py @@ -184,11 +184,11 @@ def cluster_specific_kernel( * len((mma_warp_id, *epilogue_warp_ids)), # 5 warps = 160 threads ) tmem = utils.TmemAllocator( - storage.tmem_holding_buffer, + storage.tmem_holding_buffer.ptr, barrier_for_retrieve=tmem_alloc_barrier, allocator_warp_id=epilogue_warp_ids[0], is_two_cta=True, - two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar, + two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar.ptr, ) num_tma_copy_bytes = ( diff --git a/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_5.py b/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_5.py index a483120af..e1f5dd750 100644 --- a/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_5.py +++ b/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_5.py @@ -171,11 +171,11 @@ def kernel( * len((mma_warp_id, *epilogue_warp_ids)), # 5 warps = 160 threads ) tmem = utils.TmemAllocator( - storage.tmem_holding_buffer, + storage.tmem_holding_buffer.ptr, barrier_for_retrieve=tmem_alloc_barrier, allocator_warp_id=epilogue_warp_ids[0], is_two_cta=True, - two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar, + two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar.ptr, ) num_tma_copy_bytes = ( diff --git a/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_6.py b/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_6.py index 371880ac6..f06c95444 100644 --- a/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_6.py +++ b/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_6.py @@ -214,11 +214,11 @@ def gemm( * len((mma_warp_id, *epilogue_warp_ids)), # 5 warps = 160 threads ) tmem = utils.TmemAllocator( - storage.tmem_holding_buffer, + storage.tmem_holding_buffer.ptr, barrier_for_retrieve=tmem_alloc_barrier, allocator_warp_id=epilogue_warp_ids[0], is_two_cta=True, - two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar, + two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar.ptr, ) num_tma_copy_bytes = ( diff --git a/examples/python/CuTeDSL/distributed/distributed_all_gather_gemm_blackwell.py b/examples/python/CuTeDSL/distributed/distributed_all_gather_gemm_blackwell.py index 01a7534b8..792f6f799 100644 --- a/examples/python/CuTeDSL/distributed/distributed_all_gather_gemm_blackwell.py +++ b/examples/python/CuTeDSL/distributed/distributed_all_gather_gemm_blackwell.py @@ -756,7 +756,7 @@ class PersistentDenseGemmKernel: acc_full_mbar_ptr: cute.struct.MemRange[ cutlass.Int64, self.num_acc_stage * 2 ] - tmem_dealloc_mbar_ptr: cutlass.Int64 + tmem_dealloc_mbar: cutlass.Int64 tmem_holding_buf: cutlass.Int32 smem = utils.SmemAllocator() @@ -806,11 +806,11 @@ class PersistentDenseGemmKernel: ) # Tensor memory dealloc barrier init tmem = utils.TmemAllocator( - storage.tmem_holding_buf, + storage.tmem_holding_buf.ptr, barrier_for_retrieve=tmem_alloc_barrier, allocator_warp_id=self.epilog_warp_id[0], is_two_cta=use_2cta_instrs, - two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar_ptr, + two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar.ptr, ) # Cluster arrive after barrier init diff --git a/examples/python/CuTeDSL/distributed/distributed_gemm_all_reduce_blackwell.py b/examples/python/CuTeDSL/distributed/distributed_gemm_all_reduce_blackwell.py index 6fe01aaa8..ec79fbcb0 100644 --- a/examples/python/CuTeDSL/distributed/distributed_gemm_all_reduce_blackwell.py +++ b/examples/python/CuTeDSL/distributed/distributed_gemm_all_reduce_blackwell.py @@ -672,7 +672,7 @@ class PersistentDenseGemmKernel: acc_full_mbar_ptr: cute.struct.MemRange[ cutlass.Int64, self.num_acc_stage * 2 ] - tmem_dealloc_mbar_ptr: cutlass.Int64 + tmem_dealloc_mbar: cutlass.Int64 tmem_holding_buf: cutlass.Int32 smem = utils.SmemAllocator() @@ -723,11 +723,11 @@ class PersistentDenseGemmKernel: ) # Tensor memory dealloc barrier init tmem = utils.TmemAllocator( - storage.tmem_holding_buf, + storage.tmem_holding_buf.ptr, barrier_for_retrieve=tmem_alloc_barrier, allocator_warp_id=self.epilogue_warp_id[0], is_two_cta=use_2cta_instrs, - two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar_ptr, + two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar.ptr, ) # Cluster arrive after barrier init diff --git a/examples/python/CuTeDSL/distributed/distributed_gemm_reduce_scatter_blackwell.py b/examples/python/CuTeDSL/distributed/distributed_gemm_reduce_scatter_blackwell.py index f196768d5..f68e3d1a5 100644 --- a/examples/python/CuTeDSL/distributed/distributed_gemm_reduce_scatter_blackwell.py +++ b/examples/python/CuTeDSL/distributed/distributed_gemm_reduce_scatter_blackwell.py @@ -541,7 +541,7 @@ class PersistentDenseGemmKernel: ab_empty_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_ab_stage] acc_full_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage] acc_empty_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage] - tmem_dealloc_mbar_ptr: cutlass.Int64 + tmem_dealloc_mbar: cutlass.Int64 tmem_holding_buf: cutlass.Int32 # (EPI_TILE_M, EPI_TILE_N, STAGE) sC: cute.struct.Align[ @@ -660,8 +660,8 @@ class PersistentDenseGemmKernel: smem = utils.SmemAllocator() storage = smem.allocate(self.shared_storage) - tmem_dealloc_mbar_ptr = storage.tmem_dealloc_mbar_ptr - tmem_holding_buf = storage.tmem_holding_buf + tmem_dealloc_mbar_ptr = storage.tmem_dealloc_mbar.ptr + tmem_holding_buf = storage.tmem_holding_buf.ptr # Initialize mainloop ab_pipeline (barrier) and states ab_pipeline_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread) diff --git a/examples/python/CuTeDSL/notebooks/tour_to_sol_gemm.ipynb b/examples/python/CuTeDSL/notebooks/tour_to_sol_gemm.ipynb index 7b39c5fea..0dd4ee992 100644 --- a/examples/python/CuTeDSL/notebooks/tour_to_sol_gemm.ipynb +++ b/examples/python/CuTeDSL/notebooks/tour_to_sol_gemm.ipynb @@ -369,7 +369,7 @@ " num_threads=threads_per_cta,\n", " )\n", " tmem = utils.TmemAllocator(\n", - " storage.tmem_holding_buf,\n", + " storage.tmem_holding_buf.ptr,\n", " barrier_for_retrieve=tmem_alloc_barrier,\n", " )\n", " num_tmem_cols = 512\n", @@ -742,7 +742,7 @@ " num_threads=threads_per_cta,\n", " )\n", " tmem = utils.TmemAllocator(\n", - " storage.tmem_holding_buf,\n", + " storage.tmem_holding_buf.ptr,\n", " barrier_for_retrieve=tmem_alloc_barrier,\n", " )\n", " num_tmem_cols = 512\n",