diff --git a/examples/python/CuTeDSL/blackwell/dense_blockscaled_gemm_persistent_prefetch.py b/examples/python/CuTeDSL/blackwell/dense_blockscaled_gemm_persistent_prefetch.py
index 4b8f0f22b..f9d44e96d 100644
--- a/examples/python/CuTeDSL/blackwell/dense_blockscaled_gemm_persistent_prefetch.py
+++ b/examples/python/CuTeDSL/blackwell/dense_blockscaled_gemm_persistent_prefetch.py
@@ -647,7 +647,7 @@ class Sm100BlockScaledPersistentDenseGemmKernel:
             ab_empty_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_ab_stage]
             acc_full_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage]
             acc_empty_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage]
-            tmem_dealloc_mbar_ptr: cutlass.Int64
+            tmem_dealloc_mbar: cutlass.Int64
             tmem_holding_buf: cutlass.Int32
             # (EPI_TILE_M, EPI_TILE_N, STAGE)
             sC: cute.struct.Align[
@@ -826,11 +826,11 @@ class Sm100BlockScaledPersistentDenseGemmKernel:
 
         # Tensor memory dealloc barrier init
         tmem = utils.TmemAllocator(
-            storage.tmem_holding_buf,
+            storage.tmem_holding_buf.ptr,
             barrier_for_retrieve=self.tmem_alloc_barrier,
             allocator_warp_id=self.epilog_warp_id[0],
             is_two_cta=use_2cta_instrs,
-            two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar_ptr,
+            two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar.ptr,
         )
 
         # Cluster arrive after barrier init
diff --git a/examples/python/CuTeDSL/blackwell/dense_gemm_persistent_prefetch.py b/examples/python/CuTeDSL/blackwell/dense_gemm_persistent_prefetch.py
index 1892a842c..61accba03 100644
--- a/examples/python/CuTeDSL/blackwell/dense_gemm_persistent_prefetch.py
+++ b/examples/python/CuTeDSL/blackwell/dense_gemm_persistent_prefetch.py
@@ -648,7 +648,7 @@ class PersistentDenseGemmKernel:
             acc_full_mbar_ptr: cute.struct.MemRange[
                 cutlass.Int64, self.num_acc_stage * 2
             ]
-            tmem_dealloc_mbar_ptr: cutlass.Int64
+            tmem_dealloc_mbar: cutlass.Int64
             tmem_holding_buf: cutlass.Int32
 
         smem = utils.SmemAllocator()
@@ -699,11 +699,11 @@ class PersistentDenseGemmKernel:
             )
         # Tensor memory dealloc barrier init
         tmem = utils.TmemAllocator(
-            storage.tmem_holding_buf,
+            storage.tmem_holding_buf.ptr,
             barrier_for_retrieve=tmem_alloc_barrier,
             allocator_warp_id=self.epilog_warp_id[0],
             is_two_cta=use_2cta_instrs,
-            two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar_ptr,
+            two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar.ptr,
         )
 
         # Cluster arrive after barrier init
diff --git a/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_2.py b/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_2.py
index d2e1ccff1..a1c1f9527 100644
--- a/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_2.py
+++ b/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_2.py
@@ -219,11 +219,11 @@ def kernel(
         * len((mma_warp_id, *epilogue_warp_ids)),  # 5 warps = 160 threads
     )
     tmem = utils.TmemAllocator(
-        storage.tmem_holding_buffer,
+        storage.tmem_holding_buffer.ptr,
         barrier_for_retrieve=tmem_alloc_barrier,
         allocator_warp_id=epilogue_warp_ids[0],
         is_two_cta=True if use_2cta_instrs else False,
-        two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar,
+        two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar.ptr,
     )
 
     # Partition tensors for TMA; This requires the tensors partitioned for MMA
diff --git a/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_3.py b/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_3.py
index 9f831e961..96a816bce 100644
--- a/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_3.py
+++ b/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_3.py
@@ -152,11 +152,11 @@ def kernel(
         * len((mma_warp_id, *epilogue_warp_ids)),  # 5 warps = 160 threads
     )
     tmem = utils.TmemAllocator(
-        storage.tmem_holding_buffer,
+        storage.tmem_holding_buffer.ptr,
         barrier_for_retrieve=tmem_alloc_barrier,
         allocator_warp_id=epilogue_warp_ids[0],
         is_two_cta=True,
-        two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar,
+        two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar.ptr,
     )
 
     num_tma_copy_bytes = (
diff --git a/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_3_1.py b/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_3_1.py
index 553758cf7..961949e09 100644
--- a/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_3_1.py
+++ b/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_3_1.py
@@ -159,11 +159,11 @@ def kernel(
         * len((mma_warp_id, *epilogue_warp_ids)),  # 5 warps = 160 threads
     )
     tmem = utils.TmemAllocator(
-        storage.tmem_holding_buffer,
+        storage.tmem_holding_buffer.ptr,
         barrier_for_retrieve=tmem_alloc_barrier,
         allocator_warp_id=epilogue_warp_ids[0],
         is_two_cta=True,
-        two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar,
+        two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar.ptr,
     )
 
     num_tma_copy_bytes = (
diff --git a/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_4.py b/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_4.py
index e2d47397d..0576778cf 100644
--- a/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_4.py
+++ b/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_4.py
@@ -184,11 +184,11 @@ def cluster_specific_kernel(
         * len((mma_warp_id, *epilogue_warp_ids)),  # 5 warps = 160 threads
     )
     tmem = utils.TmemAllocator(
-        storage.tmem_holding_buffer,
+        storage.tmem_holding_buffer.ptr,
         barrier_for_retrieve=tmem_alloc_barrier,
         allocator_warp_id=epilogue_warp_ids[0],
         is_two_cta=True,
-        two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar,
+        two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar.ptr,
     )
 
     num_tma_copy_bytes = (
diff --git a/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_5.py b/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_5.py
index a483120af..e1f5dd750 100644
--- a/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_5.py
+++ b/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_5.py
@@ -171,11 +171,11 @@ def kernel(
         * len((mma_warp_id, *epilogue_warp_ids)),  # 5 warps = 160 threads
     )
     tmem = utils.TmemAllocator(
-        storage.tmem_holding_buffer,
+        storage.tmem_holding_buffer.ptr,
         barrier_for_retrieve=tmem_alloc_barrier,
         allocator_warp_id=epilogue_warp_ids[0],
         is_two_cta=True,
-        two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar,
+        two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar.ptr,
     )
 
     num_tma_copy_bytes = (
diff --git a/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_6.py b/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_6.py
index 371880ac6..f06c95444 100644
--- a/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_6.py
+++ b/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_6.py
@@ -214,11 +214,11 @@ def gemm(
         * len((mma_warp_id, *epilogue_warp_ids)),  # 5 warps = 160 threads
     )
     tmem = utils.TmemAllocator(
-        storage.tmem_holding_buffer,
+        storage.tmem_holding_buffer.ptr,
         barrier_for_retrieve=tmem_alloc_barrier,
         allocator_warp_id=epilogue_warp_ids[0],
         is_two_cta=True,
-        two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar,
+        two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar.ptr,
     )
 
     num_tma_copy_bytes = (
diff --git a/examples/python/CuTeDSL/distributed/distributed_all_gather_gemm_blackwell.py b/examples/python/CuTeDSL/distributed/distributed_all_gather_gemm_blackwell.py
index 01a7534b8..792f6f799 100644
--- a/examples/python/CuTeDSL/distributed/distributed_all_gather_gemm_blackwell.py
+++ b/examples/python/CuTeDSL/distributed/distributed_all_gather_gemm_blackwell.py
@@ -756,7 +756,7 @@ class PersistentDenseGemmKernel:
             acc_full_mbar_ptr: cute.struct.MemRange[
                 cutlass.Int64, self.num_acc_stage * 2
             ]
-            tmem_dealloc_mbar_ptr: cutlass.Int64
+            tmem_dealloc_mbar: cutlass.Int64
             tmem_holding_buf: cutlass.Int32
 
         smem = utils.SmemAllocator()
@@ -806,11 +806,11 @@ class PersistentDenseGemmKernel:
             )
         # Tensor memory dealloc barrier init
         tmem = utils.TmemAllocator(
-            storage.tmem_holding_buf,
+            storage.tmem_holding_buf.ptr,
             barrier_for_retrieve=tmem_alloc_barrier,
             allocator_warp_id=self.epilog_warp_id[0],
             is_two_cta=use_2cta_instrs,
-            two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar_ptr,
+            two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar.ptr,
         )
 
         # Cluster arrive after barrier init
diff --git a/examples/python/CuTeDSL/distributed/distributed_gemm_all_reduce_blackwell.py b/examples/python/CuTeDSL/distributed/distributed_gemm_all_reduce_blackwell.py
index 6fe01aaa8..ec79fbcb0 100644
--- a/examples/python/CuTeDSL/distributed/distributed_gemm_all_reduce_blackwell.py
+++ b/examples/python/CuTeDSL/distributed/distributed_gemm_all_reduce_blackwell.py
@@ -672,7 +672,7 @@ class PersistentDenseGemmKernel:
             acc_full_mbar_ptr: cute.struct.MemRange[
                 cutlass.Int64, self.num_acc_stage * 2
             ]
-            tmem_dealloc_mbar_ptr: cutlass.Int64
+            tmem_dealloc_mbar: cutlass.Int64
             tmem_holding_buf: cutlass.Int32
 
         smem = utils.SmemAllocator()
@@ -723,11 +723,11 @@ class PersistentDenseGemmKernel:
             )
         # Tensor memory dealloc barrier init
         tmem = utils.TmemAllocator(
-            storage.tmem_holding_buf,
+            storage.tmem_holding_buf.ptr,
             barrier_for_retrieve=tmem_alloc_barrier,
             allocator_warp_id=self.epilogue_warp_id[0],
             is_two_cta=use_2cta_instrs,
-            two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar_ptr,
+            two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar.ptr,
         )
 
         # Cluster arrive after barrier init
diff --git a/examples/python/CuTeDSL/distributed/distributed_gemm_reduce_scatter_blackwell.py b/examples/python/CuTeDSL/distributed/distributed_gemm_reduce_scatter_blackwell.py
index f196768d5..f68e3d1a5 100644
--- a/examples/python/CuTeDSL/distributed/distributed_gemm_reduce_scatter_blackwell.py
+++ b/examples/python/CuTeDSL/distributed/distributed_gemm_reduce_scatter_blackwell.py
@@ -541,7 +541,7 @@ class PersistentDenseGemmKernel:
             ab_empty_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_ab_stage]
             acc_full_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage]
             acc_empty_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage]
-            tmem_dealloc_mbar_ptr: cutlass.Int64
+            tmem_dealloc_mbar: cutlass.Int64
             tmem_holding_buf: cutlass.Int32
             # (EPI_TILE_M, EPI_TILE_N, STAGE)
             sC: cute.struct.Align[
@@ -660,8 +660,8 @@ class PersistentDenseGemmKernel:
         smem = utils.SmemAllocator()
         storage = smem.allocate(self.shared_storage)
 
-        tmem_dealloc_mbar_ptr = storage.tmem_dealloc_mbar_ptr
-        tmem_holding_buf = storage.tmem_holding_buf
+        tmem_dealloc_mbar_ptr = storage.tmem_dealloc_mbar.ptr
+        tmem_holding_buf = storage.tmem_holding_buf.ptr
 
         # Initialize mainloop ab_pipeline (barrier) and states
         ab_pipeline_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread)
diff --git a/examples/python/CuTeDSL/notebooks/tour_to_sol_gemm.ipynb b/examples/python/CuTeDSL/notebooks/tour_to_sol_gemm.ipynb
index 7b39c5fea..0dd4ee992 100644
--- a/examples/python/CuTeDSL/notebooks/tour_to_sol_gemm.ipynb
+++ b/examples/python/CuTeDSL/notebooks/tour_to_sol_gemm.ipynb
@@ -369,7 +369,7 @@
     "        num_threads=threads_per_cta,\n",
     "    )\n",
     "    tmem = utils.TmemAllocator(\n",
-    "        storage.tmem_holding_buf,\n",
+    "        storage.tmem_holding_buf.ptr,\n",
     "        barrier_for_retrieve=tmem_alloc_barrier,\n",
     "    )\n",
     "    num_tmem_cols = 512\n",
@@ -742,7 +742,7 @@
     "        num_threads=threads_per_cta,\n",
     "    )\n",
     "    tmem = utils.TmemAllocator(\n",
-    "        storage.tmem_holding_buf,\n",
+    "        storage.tmem_holding_buf.ptr,\n",
     "        barrier_for_retrieve=tmem_alloc_barrier,\n",
     "    )\n",
     "    num_tmem_cols = 512\n",