diff --git a/python/mscclpp/ext/alltoallv_single.py b/python/mscclpp/ext/alltoallv_single.py
index 86c57b43..6f583ae7 100644
--- a/python/mscclpp/ext/alltoallv_single.py
+++ b/python/mscclpp/ext/alltoallv_single.py
@@ -166,6 +166,8 @@ class MscclppAlltoAllV:
         self._cached_output_size = 0
         self._cached_total_output_elems = 0
         self._cached_dtype = None
+        # One-time check for untyped_storage (available since PyTorch 1.13)
+        self._has_untyped_storage = hasattr(torch.Tensor, 'untyped_storage')
         # Pre-built extras dict (GPU pointers don't change)
         self._extras = {
             "sendCounts": self._d_send_counts.data_ptr(),
@@ -293,8 +295,12 @@
         # Use the full underlying storage size for context key stability.
         # When the test reuses the same large tensor with different split sizes,
         # storage size stays constant → same context key → reuses channels.
-        input_alloc_size = input.untyped_storage().size()
-        output_alloc_size = output.untyped_storage().size()
+        if self._has_untyped_storage:
+            input_alloc_size = input.untyped_storage().size()
+            output_alloc_size = output.untyped_storage().size()
+        else:
+            input_alloc_size = input.nelement() * input.element_size()
+            output_alloc_size = output.nelement() * output.element_size()
 
         if _DEBUG:
             # Clear stale CUDA errors (the C++ code checks cudaGetLastError
@@ -304,6 +310,7 @@
             if _last_err != 0:
                 print(f" [rank {self._rank}] WARNING: cleared stale CUDA error code {_last_err} before execute", flush=True)
             print(f" [rank {self._rank}] alltoallv: calling algo.execute(input_alloc={input_alloc_size}, output_alloc={output_alloc_size})", flush=True)
+
         result = self._algo.execute(
             self._comm,
             input.data_ptr(),
@@ -318,6 +325,7 @@
             0,  # nthreads_per_block (auto)
             self._extras,
         )
+
         if _DEBUG:
             print(f" [rank {self._rank}] alltoallv: algo.execute returned {result}", flush=True)
 