NVLS support for msccl++ executor (#375)

- Support more datatypes for the multicast operations
- Add new OP MULTI_LOAD_REDUCE_STORE to support NVLS
- Modify allocSharedPhysicalCuda to return std::shared_ptr<T>
instead of std::shared_ptr<PhysicalCudaMemory<T>>
- Add Python support for allocSharedPhysicalCuda

Test passed for `allreduce_nvls.json`
Binyang Li committed 2024-11-19 22:43:28 -08:00 (committed by GitHub)
parent 3e51e9b359
commit 28a57b0610
26 changed files with 2116 additions and 212 deletions
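
As a usage sketch of the new API shape (the wrapper function and element type here are hypothetical; only `allocSharedPhysicalCuda` and its defaulted `gran` come from this change):

```
#include <mscclpp/gpu_utils.hpp>

// Minimal sketch: physical memory now behaves like any other shared device
// pointer instead of being wrapped in PhysicalCudaMemory<T>.
std::shared_ptr<float> makeNvlsBuffer(size_t count) {
  // gran defaults to 0, which resolves to the recommended multicast
  // granularity; throws on GPUs without NVLS support.
  return mscclpp::allocSharedPhysicalCuda<float>(count);
}
```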

View File

@@ -7,12 +7,12 @@
#include <mscclpp/concurrency_device.hpp>
#include <mscclpp/core.hpp>
#include <mscclpp/gpu.hpp>
#include <mscclpp/gpu_data_types.hpp>
#include <mscclpp/packet_device.hpp>
#include <mscclpp/sm_channel.hpp>
#include <mscclpp/sm_channel_device.hpp>
#include "common.hpp"
#include "gpu_data_types.hpp"
template <typename To, typename From>
__forceinline__ __device__ To bit_cast(const From& src) {

View File

@@ -9,7 +9,7 @@ baseImageTable=(
["cuda12.2"]="nvidia/cuda:12.2.2-devel-ubuntu20.04"
["cuda12.3"]="nvidia/cuda:12.3.2-devel-ubuntu20.04"
["cuda12.4"]="nvidia/cuda:12.4.1-devel-ubuntu22.04"
["rocm6.2"]="rocm/rocm-terminal:6.2"
["rocm6.2"]="rocm/rocm-terminal:6.2.1"
)
declare -A extraLdPathTable

View File

@@ -19,6 +19,10 @@
```
lsmod | grep nvidia_peermem
```
* For GPUs with NVLS support, the IMEX channels should be set up (refer to [cuMemCreate](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g899d69a862bba36449789c64b430dc7c)). You can set up the channels manually via:
```
sudo nvidia-modprobe -s -i <start:number of minors>
```
## Build with Docker Images

View File

@@ -22,6 +22,7 @@ using CUdeviceptr = hipDeviceptr_t;
using CUmemGenericAllocationHandle = hipMemGenericAllocationHandle_t;
using CUmemAllocationProp = hipMemAllocationProp;
using CUmemAccessDesc = hipMemAccessDesc;
using CUmemAllocationHandleType = hipMemAllocationHandleType;
constexpr auto cudaSuccess = hipSuccess;
constexpr auto cudaStreamNonBlocking = hipStreamNonBlocking;
@@ -86,6 +87,9 @@ constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWri
#define cuMemSetAccess(...) hipMemSetAccess(__VA_ARGS__)
#define cuMemMap(...) hipMemMap(__VA_ARGS__)
#define cuMemUnmap(...) hipMemUnmap(__VA_ARGS__)
#define cuMemRetainAllocationHandle(...) hipMemRetainAllocationHandle(__VA_ARGS__)
#define cuMemExportToShareableHandle(...) hipMemExportToShareableHandle(__VA_ARGS__)
#define cuMemImportFromShareableHandle(...) hipMemImportFromShareableHandle(__VA_ARGS__)
#else
@@ -97,9 +101,10 @@ constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWri
// NVLS
#if !defined(__HIP_PLATFORM_AMD__)
#include <linux/version.h>
#define USE_NVLS ((CUDART_VERSION >= 12010) && (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0)))
// We need CU_MEM_HANDLE_TYPE_FABRIC (introduced in CUDA 12.3) to support sharing handles across GPUs via sockets
#define CUDA_NVLS_SUPPORTED ((CUDART_VERSION >= 12030) && (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0)))
#else // !defined(__HIP_PLATFORM_AMD__)
#define USE_NVLS 0
#define CUDA_NVLS_SUPPORTED 0
#endif // !defined(__HIP_PLATFORM_AMD__)
// GPU sync threads

View File

@@ -16,6 +16,7 @@ using __bfloat162 = __hip_bfloat162;
#else
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#if (CUDART_VERSION >= 11000)
#include <cuda_bf16.h>
#endif

View File

@@ -9,6 +9,7 @@
#include "errors.hpp"
#include "gpu.hpp"
#include "utils.hpp"
/// Throw @ref mscclpp::CudaError if @p cmd does not return cudaSuccess.
/// @param cmd The command to execute.
@@ -34,6 +35,19 @@
namespace mscclpp {
/// Set memory access permission to read-write.
/// @param base Base memory pointer.
/// @param size Size of the memory.
inline void setReadWriteMemoryAccess(void* base, size_t size) {
CUmemAccessDesc accessDesc = {};
int deviceId;
MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDesc.location.id = deviceId;
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)base, size, &accessDesc, 1));
}
/// A RAII guard that will cudaThreadExchangeStreamCaptureMode to cudaStreamCaptureModeRelaxed on construction and
/// restore the previous mode on destruction. This is helpful when we want to avoid CUDA graph capture.
struct AvoidCudaGraphCaptureGuard {
@@ -53,15 +67,6 @@ struct CudaStreamWithFlags {
template <class T>
struct CudaDeleter;
template <class T>
struct PhysicalCudaMemory {
CUmemGenericAllocationHandle memHandle_;
T* devicePtr_;
size_t size_;
PhysicalCudaMemory(CUmemGenericAllocationHandle memHandle, T* devicePtr, size_t size)
: memHandle_(memHandle), devicePtr_(devicePtr), size_(size) {}
};
namespace detail {
/// A wrapper of cudaMalloc that sets the allocated memory to zero.
@@ -79,46 +84,38 @@ T* cudaCalloc(size_t nelem) {
return ptr;
}
#if (CUDA_NVLS_SUPPORTED)
template <class T>
PhysicalCudaMemory<T>* cudaPhysicalCalloc(size_t nelem, size_t gran) {
T* cudaPhysicalCalloc(size_t nelems, size_t gran) {
AvoidCudaGraphCaptureGuard cgcGuard;
int deviceId = -1;
CUdevice currentDevice;
MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));
MSCCLPP_CUTHROW(cuDeviceGet(&currentDevice, deviceId));
CUmemAllocationProp prop = {};
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
prop.location.id = deviceId;
#if defined(__HIP_PLATFORM_AMD__)
// TODO: revisit when HIP fixes this typo in the field name
prop.requestedHandleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
#else
prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
#endif
prop.requestedHandleTypes =
(CUmemAllocationHandleType)(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR | CU_MEM_HANDLE_TYPE_FABRIC);
prop.location.id = currentDevice;
CUmemGenericAllocationHandle memHandle;
size_t bufferSize = sizeof(T) * nelem;
// allocate physical memory
MSCCLPP_CUTHROW(cuMemCreate(&memHandle, bufferSize, &prop, 0 /*flags*/));
CUmemAccessDesc accessDesc = {};
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDesc.location.id = deviceId;
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
CUmemGenericAllocationHandle memHandle;
size_t nbytes = (nelems * sizeof(T) + gran - 1) / gran * gran;
MSCCLPP_CUTHROW(cuMemCreate(&memHandle, nbytes, &prop, 0 /*flags*/));
T* devicePtr = nullptr;
// Map the device pointer
MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&devicePtr, bufferSize, gran, 0U, 0));
MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)devicePtr, bufferSize, 0, memHandle, 0));
MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)devicePtr, bufferSize, &accessDesc, 1));
MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&devicePtr, nbytes, gran, 0U, 0));
MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)devicePtr, nbytes, 0, memHandle, 0));
setReadWriteMemoryAccess(devicePtr, nbytes);
CudaStreamWithFlags stream(cudaStreamNonBlocking);
MSCCLPP_CUDATHROW(cudaMemsetAsync(devicePtr, 0, bufferSize, stream));
MSCCLPP_CUDATHROW(cudaMemsetAsync(devicePtr, 0, nbytes, stream));
MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
return new PhysicalCudaMemory<T>(memHandle, devicePtr, bufferSize);
return devicePtr;
}
#endif
template <class T>
T* cudaExtCalloc(size_t nelem) {
@@ -206,11 +203,15 @@ struct CudaDeleter {
template <class T>
struct CudaPhysicalDeleter {
static_assert(!std::is_array_v<T>, "T must not be an array");
void operator()(PhysicalCudaMemory<T>* ptr) {
void operator()(T* ptr) {
AvoidCudaGraphCaptureGuard cgcGuard;
MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr->devicePtr_, ptr->size_));
MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr->devicePtr_, ptr->size_));
MSCCLPP_CUTHROW(cuMemRelease(ptr->memHandle_));
CUmemGenericAllocationHandle handle;
size_t size = 0;
MSCCLPP_CUTHROW(cuMemRetainAllocationHandle(&handle, ptr));
MSCCLPP_CUTHROW(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, size));
MSCCLPP_CUTHROW(cuMemRelease(handle));
MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, size));
}
};
@@ -234,16 +235,46 @@ std::shared_ptr<T> allocSharedCuda(size_t count = 1) {
return detail::safeAlloc<T, detail::cudaCalloc<T>, CudaDeleter<T>, std::shared_ptr<T>>(count);
}
/// Allocates physical memory on the device and returns a memory handle along with a device pointer for it.
/// The deallocation only happens when the PhysicalCudaMemory goes out of scope.
#if (CUDA_NVLS_SUPPORTED)
static inline size_t getMulticastGranularity(size_t size, CUmulticastGranularity_flags granFlag) {
size_t gran = 0;
int numDevices = 0;
MSCCLPP_CUDATHROW(cudaGetDeviceCount(&numDevices));
CUmulticastObjectProp prop = {};
prop.size = size;
// This is a dummy value; it might affect the granularity in the future.
prop.numDevices = numDevices;
prop.handleTypes = (CUmemAllocationHandleType)(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR | CU_MEM_HANDLE_TYPE_FABRIC);
prop.flags = 0;
MSCCLPP_CUTHROW(cuMulticastGetGranularity(&gran, &prop, granFlag));
return gran;
}
#endif
/// Allocates physical memory on the device and returns a std::shared_ptr to it. The memory is zeroed out.
/// @tparam T Type of each element in the allocated memory.
/// @param count Number of elements to allocate.
/// @param gran The granularity of the allocation.
/// @return A std::shared_ptr to the memory handle and a device pointer for that memory.
/// @return A std::shared_ptr to the allocated memory.
template <class T>
std::shared_ptr<PhysicalCudaMemory<T>> allocSharedPhysicalCuda(size_t count, size_t gran) {
return detail::safeAlloc<PhysicalCudaMemory<T>, detail::cudaPhysicalCalloc<T>, CudaPhysicalDeleter<T>,
std::shared_ptr<PhysicalCudaMemory<T>>>(count, gran);
std::shared_ptr<T> allocSharedPhysicalCuda([[maybe_unused]] size_t count, [[maybe_unused]] size_t gran = 0) {
#if (CUDA_NVLS_SUPPORTED)
if (!isNvlsSupported()) {
throw Error("Only support GPU with NVLS support", ErrorCode::InvalidUsage);
}
if (count == 0) {
return nullptr;
}
if (gran == 0) {
gran = getMulticastGranularity(count * sizeof(T), CU_MULTICAST_GRANULARITY_RECOMMENDED);
}
size_t nelems = ((count * sizeof(T) + gran - 1) / gran * gran) / sizeof(T);
return detail::safeAlloc<T, detail::cudaPhysicalCalloc<T>, CudaPhysicalDeleter<T>, std::shared_ptr<T>>(nelems, gran);
#else
throw Error("Only support GPU with Fabric support", ErrorCode::InvalidUsage);
#endif
}
/// Allocates memory on the device and returns a std::shared_ptr to it. The memory is zeroed out.
@@ -269,18 +300,6 @@ UniqueCudaPtr<T> allocUniqueCuda(size_t count = 1) {
return detail::safeAlloc<T, detail::cudaCalloc<T>, CudaDeleter<T>, UniqueCudaPtr<T>>(count);
}
/// Allocates physical memory on the device and returns a memory handle along with a virtual memory handle for it.
/// The memory is zeroed out.
/// @tparam T Type of each element in the allocated memory.
/// @param count Number of elements to allocate.
/// @param gran The granularity of the allocation.
/// @return A std::unique_ptr to the memory handle and a device pointer for that memory.
template <class T>
std::unique_ptr<PhysicalCudaMemory<T>> allocUniquePhysicalCuda(size_t count, size_t gran) {
return detail::safeAlloc<PhysicalCudaMemory<T>, detail::cudaPhysicalCalloc<T>, CudaPhysicalDeleter<T>,
std::unique_ptr<CudaPhysicalDeleter<T>, CudaDeleter<CudaPhysicalDeleter<T>>>>(count, gran);
}
/// Allocates memory on the device and returns a std::unique_ptr to it. The memory is zeroed out.
/// @tparam T Type of each element in the allocated memory.
/// @param count Number of elements to allocate.
@@ -349,6 +368,32 @@ UniqueCudaHostPtr<T> makeUniqueCudaHost(size_t count) {
return ptr;
}
/// Allocates physical memory on the device and returns a std::unique_ptr to it.
/// The memory is zeroed out.
/// @tparam T Type of each element in the allocated memory.
/// @param count Number of elements to allocate.
/// @param gran The granularity of the allocation.
/// @return A std::unique_ptr to the allocated memory.
template <class T>
std::unique_ptr<T> allocUniquePhysicalCuda([[maybe_unused]] size_t count, [[maybe_unused]] size_t gran = 0) {
#if (CUDA_NVLS_SUPPORTED)
if (!isNvlsSupported()) {
throw Error("Only support GPU with NVLS support", ErrorCode::InvalidUsage);
}
if (count == 0) {
return nullptr;
}
if (gran == 0) {
gran = getMulticastGranularity(count * sizeof(T), CU_MULTICAST_GRANULARITY_RECOMMENDED);
}
return detail::safeAlloc<T, detail::cudaPhysicalCalloc<T>, CudaPhysicalDeleter<T>,
std::unique_ptr<CudaPhysicalDeleter<T>, CudaDeleter<CudaPhysicalDeleter<T>>>>(count, gran);
#else
throw Error("Only support GPU with Fabric support", ErrorCode::InvalidUsage);
#endif
}
/// Asynchronous cudaMemcpy without capture into a CUDA graph.
/// @tparam T Type of each element in the allocated memory.
/// @param dst Destination pointer.
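
With the `PhysicalCudaMemory` wrapper gone, `CudaPhysicalDeleter` recovers the allocation handle and mapped size from the raw pointer itself (via `cuMemRetainAllocationHandle` and `cuMemGetAddressRange`), which is what allows the API to hand back a plain `std::shared_ptr<T>`. The size rounding above can be checked with a small sketch (the 2 MiB granularity is an assumed value):

```
#include <cstddef>

// Worked example of the rounding in cudaPhysicalCalloc/allocSharedPhysicalCuda.
constexpr size_t roundToGran(size_t nbytes, size_t gran) {
  return (nbytes + gran - 1) / gran * gran;
}
// With an assumed 2 MiB granularity, 1000 floats round up to one full granule:
static_assert(roundToGran(1000 * sizeof(float), 2 << 20) == (2 << 20));
// i.e. (2 << 20) / sizeof(float) == 524288 elements are actually allocated and zeroed.
```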

View File

@@ -25,26 +25,26 @@ class NvlsConnection {
struct DeviceMulticastPointer {
private:
std::shared_ptr<PhysicalCudaMemory<char>> deviceMem_;
void* devicePtr_;
std::shared_ptr<char> mcPtr_;
size_t bufferSize_;
public:
using DeviceHandle = DeviceMulticastPointerDeviceHandle;
DeviceMulticastPointer(std::shared_ptr<PhysicalCudaMemory<char>> deviceMem, std::shared_ptr<char> mcPtr,
size_t bufferSize)
: deviceMem_(deviceMem), mcPtr_(mcPtr), bufferSize_(bufferSize) {}
DeviceMulticastPointer(void* devicePtr, std::shared_ptr<char> mcPtr, size_t bufferSize)
: devicePtr_(devicePtr), mcPtr_(mcPtr), bufferSize_(bufferSize) {}
DeviceHandle deviceHandle();
char* getDevicePtr();
void* getDevicePtr();
friend class NvlsConnection;
};
std::shared_ptr<DeviceMulticastPointer> allocateAndBindCuda(size_t size);
/// The \p handle to the allocation (its lifetime is managed by the caller)
/// and the \p size of the allocation.
std::shared_ptr<char> bindAllocatedCuda(CUmemGenericAllocationHandle memHandle, size_t size);
/// @brief Bind memory allocated via @ref mscclpp::allocSharedPhysicalCuda to the multicast handle. The behavior
/// is undefined if the devicePtr is not allocated by @ref mscclpp::allocSharedPhysicalCuda.
/// @param devicePtr Device pointer returned by @ref mscclpp::allocSharedPhysicalCuda.
/// @param size Size of the memory to bind.
/// @return DeviceMulticastPointer with devicePtr, mcPtr and bufferSize
DeviceMulticastPointer bindAllocatedMemory(CUdeviceptr devicePtr, size_t size);
size_t getMultiCastMinGranularity();
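
Taken together with the allocator change, binding now composes like this (a sketch; `conn` and `size` are assumed to come from an established NVLS setup such as `connectNvlsCollective`):

```
#include <mscclpp/gpu_utils.hpp>
#include <mscclpp/nvls.hpp>

// Sketch: bind caller-owned physical memory to a multicast group. The caller
// must keep `buf` alive for as long as the binding is in use.
void bindExample(mscclpp::NvlsConnection& conn, size_t size) {
  auto buf = mscclpp::allocSharedPhysicalCuda<char>(size);
  auto mcp = conn.bindAllocatedMemory((CUdeviceptr)buf.get(), size);
  auto handle = mcp.deviceHandle();  // exposes devicePtr, mcPtr, bufferSize to kernels
  (void)handle;
}
```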

View File

@@ -11,6 +11,8 @@
#include <cuda_fp16.h>
#endif // defined(MSCCLPP_DEVICE_CUDA)
#include <mscclpp/gpu_data_types.hpp>
#include "device.hpp"
namespace mscclpp {
@@ -27,7 +29,11 @@ struct DeviceMulticastPointerDeviceHandle {
#if defined(MSCCLPP_DEVICE_CUDA)
template <typename TValue, typename T>
MSCCLPP_DEVICE_INLINE static void multimemLoadReduce(TValue& val, T* ptr) {
if constexpr (std::is_same_v<TValue, uint4> && std::is_same_v<T, float>) {
if constexpr (std::is_same_v<TValue, int32_t> && std::is_same_v<T, int32_t>) {
asm("multimem.ld_reduce.relaxed.sys.global.add.s32 %0, [%1];" : "=r"(val) : "l"(ptr) : "memory");
} else if constexpr (std::is_same_v<TValue, uint32_t> && std::is_same_v<T, uint32_t>) {
asm("multimem.ld_reduce.relaxed.sys.global.add.u32 %0, [%1];" : "=r"(val) : "l"(ptr) : "memory");
} else if constexpr (std::is_same_v<TValue, uint4> && std::is_same_v<T, float>) {
asm("multimem.ld_reduce.relaxed.sys.global.add.v4.f32 {%0,%1,%2,%3}, [%4];"
: "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w)
: "l"(ptr)
@@ -51,6 +57,13 @@ struct DeviceMulticastPointerDeviceHandle {
: "memory");
} else if constexpr (std::is_same_v<TValue, uint1> && std::is_same_v<T, __half2>) {
asm("multimem.ld_reduce.relaxed.sys.global.add.f16x2 {%0}, [%1];" : "=r"(val.x) : "l"(ptr) : "memory");
} else if constexpr (std::is_same_v<TValue, uint4> && std::is_same_v<T, __bfloat162>) {
asm("multimem.ld_reduce.relaxed.sys.global.add.v4.bf16x2 {%0,%1,%2,%3}, [%4];"
: "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w)
: "l"(ptr)
: "memory");
} else if constexpr (std::is_same_v<TValue, uint1> && std::is_same_v<T, __bfloat162>) {
asm("multimem.ld_reduce.relaxed.sys.global.add.bf16x2 {%0}, [%1];" : "=r"(val.x) : "l"(ptr) : "memory");
} else {
static_assert(dependentFalse<T>, "Not supported type");
}
@@ -58,7 +71,11 @@ struct DeviceMulticastPointerDeviceHandle {
template <typename TValue, typename T>
MSCCLPP_DEVICE_INLINE static void multimemStore(const TValue& val, T* ptr) {
if constexpr (std::is_same_v<TValue, uint4> && std::is_same_v<T, float>) {
if constexpr (std::is_same_v<TValue, int32_t> && std::is_same_v<T, int32_t>) {
asm volatile("multimem.st.relaxed.sys.global.s32 [%0], %1;" ::"l"(ptr), "r"(val) : "memory");
} else if constexpr (std::is_same_v<TValue, uint32_t> && std::is_same_v<T, uint32_t>) {
asm volatile("multimem.st.relaxed.sys.global.u32 [%0], %1;" ::"l"(ptr), "r"(val) : "memory");
} else if constexpr (std::is_same_v<TValue, uint4> && std::is_same_v<T, float>) {
asm volatile("multimem.st.relaxed.sys.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.x), "r"(val.y),
"r"(val.z), "r"(val.w)
: "memory");
@@ -76,6 +93,12 @@ struct DeviceMulticastPointerDeviceHandle {
: "memory");
} else if constexpr (std::is_same_v<TValue, uint1> && std::is_same_v<T, __half2>) {
asm volatile("multimem.st.relaxed.sys.global.f16x2 [%0], {%1};" ::"l"(ptr), "r"(val.x) : "memory");
} else if constexpr (std::is_same_v<TValue, uint4> && std::is_same_v<T, __bfloat162>) {
asm volatile("multimem.st.relaxed.sys.global.v4.bf16x2 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.x), "r"(val.y),
"r"(val.z), "r"(val.w)
: "memory");
} else if constexpr (std::is_same_v<TValue, uint1> && std::is_same_v<T, __bfloat162>) {
asm volatile("multimem.st.relaxed.sys.global.bf16x2 [%0], {%1};" ::"l"(ptr), "r"(val.x) : "memory");
} else {
static_assert(dependentFalse<T>, "Not supported type");
}
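
For reference, a minimal device-side sketch of how these primitives compose (assumptions: sm_90, a float buffer bound to a multicast pointer, and `nelems` divisible by 4; this loosely mirrors the executor's new MULTI_LOAD_REDUCE_STORE op further down in this commit):

```
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
// Each ld_reduce returns the element-wise sum across all GPUs in the multicast
// group; the store then broadcasts the result back through the same mc pointer.
__device__ void multimemAllReduceSpan(float* mcPtr, size_t nelems) {
  using Handle = mscclpp::DeviceMulticastPointerDeviceHandle;
  for (size_t i = threadIdx.x * 4; i < nelems; i += blockDim.x * 4) {
    uint4 val;  // uint4 + float selects the v4.f32 variant above
    Handle::multimemLoadReduce(val, mcPtr + i);
    Handle::multimemStore(val, mcPtr + i);
  }
}
#endif
```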

View File

@@ -26,6 +26,7 @@ from ._mscclpp import (
PacketType,
version,
is_nvls_supported,
alloc_shared_physical_cuda,
npkit,
)

View File

@@ -23,6 +23,7 @@ extern void register_numa(nb::module_& m);
extern void register_nvls(nb::module_& m);
extern void register_executor(nb::module_& m);
extern void register_npkit(nb::module_& m);
extern void register_gpu_utils(nb::module_& m);
template <typename T>
void def_nonblocking_future(nb::handle& m, const std::string& typestr) {
@@ -194,4 +195,5 @@ NB_MODULE(_mscclpp, m) {
register_nvls(m);
register_executor(m);
register_npkit(m);
register_gpu_utils(m);
}

View File

@@ -0,0 +1,30 @@
#include <nanobind/nanobind.h>
#include <nanobind/stl/shared_ptr.h>
#include <mscclpp/gpu_data_types.hpp>
#include <mscclpp/gpu_utils.hpp>
namespace nb = nanobind;
using namespace mscclpp;
class PyCudaMemory {
public:
PyCudaMemory(size_t size) : size_(size) { ptr_ = allocSharedPhysicalCuda<char>(size); }
uintptr_t getPtr() const { return (uintptr_t)(ptr_.get()); }
size_t size() const { return size_; }
private:
std::shared_ptr<char> ptr_;
size_t size_;
};
void register_gpu_utils(nb::module_& m) {
nb::class_<PyCudaMemory>(m, "PyCudaMemory")
.def(nb::init<size_t>(), nb::arg("size"))
.def("get_ptr", &PyCudaMemory::getPtr, "Get the raw pointer")
.def("size", &PyCudaMemory::size, "Get the size of the allocated memory");
m.def(
"alloc_shared_physical_cuda", [](size_t size) { return std::make_shared<PyCudaMemory>(size); }, nb::arg("size"));
}
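
The `PyCudaMemory` wrapper exists so Python sees a stable integer address while the underlying `std::shared_ptr<char>` keeps the physical allocation alive; the Python tests below pass the wrapper as the owner of a cupy `UnownedMemory` for the same reason, tying the device buffer's lifetime to the Python object.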

View File

@@ -30,7 +30,7 @@ void register_nvls(nb::module_& m) {
});
nb::class_<NvlsConnection>(m, "NvlsConnection")
.def("allocate_bind_memory", &NvlsConnection::allocateAndBindCuda)
.def("bind_allocated_memory", &NvlsConnection::bindAllocatedMemory, nb::arg("devicePtr"), nb::arg("size"))
.def("get_multicast_min_granularity", &NvlsConnection::getMultiCastMinGranularity);
m.def("connect_nvls_collective", &connectNvlsCollective, nb::arg("communicator"), nb::arg("allRanks"),

View File

@@ -1,7 +1,7 @@
import os
import cupy as cp
import ctypes
from mscclpp import Transport, ProxyService, SmDevice2DeviceSemaphore
from mscclpp import Transport, ProxyService, SmDevice2DeviceSemaphore, alloc_shared_physical_cuda
import mscclpp.comm as mscclpp_comm
from mscclpp.utils import KernelBuilder, pack
@@ -443,12 +443,15 @@ class MscclppAllReduce6:
self.nvls_connection = group.make_connection(all_ranks, Transport.Nvls)
min_gran = self.nvls_connection.get_multicast_min_granularity()
aligned_buffer_size = int(((buffer_size + min_gran - 1) // min_gran) * min_gran)
self.nvls_mem_handle = self.nvls_connection.allocate_bind_memory(
aligned_buffer_size
buffer_raw = alloc_shared_physical_cuda(aligned_buffer_size)
self.nvls_mem_handle = self.nvls_connection.bind_allocated_memory(
buffer_raw.get_ptr(), aligned_buffer_size
) # just using recommended size for now
self.memory_ptr = self.nvls_mem_handle.get_device_ptr()
self.cp_memory_ptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(self.memory_ptr, aligned_buffer_size, None), 0)
self.cp_memory_ptr = cp.cuda.MemoryPointer(
cp.cuda.UnownedMemory(self.memory_ptr, aligned_buffer_size, buffer_raw), 0
)
self.memory = cp.ndarray(nelem, memory_dtype, self.cp_memory_ptr)
# create a sm_channel for each remote neighbor

View File

@@ -8,6 +8,8 @@ from mscclpp import (
ExecutionPlan,
PacketType,
npkit,
alloc_shared_physical_cuda,
is_nvls_supported,
)
import mscclpp.comm as mscclpp_comm
from mscclpp.utils import KernelBuilder, pack
@@ -125,6 +127,18 @@ def dtype_to_mscclpp_dtype(dtype):
raise ValueError(f"Unknown data type: {dtype}")
def allocate_buffer(nelems, dtype):
if is_nvls_supported():
buffer_raw = alloc_shared_physical_cuda(nelems * cp.dtype(dtype).itemsize)
buffer_ptr = cp.cuda.MemoryPointer(
cp.cuda.UnownedMemory(buffer_raw.get_ptr(), buffer_raw.size(), buffer_raw), 0
)
buffer = cp.ndarray(nelems, dtype=dtype, memptr=buffer_ptr)
return buffer
else:
return cp.zeros(nelems, dtype=dtype)
def build_bufs(
execution_plan_name: str,
size: int,
@@ -144,14 +158,14 @@ def build_bufs(
nelems_input = nelems
nelems_output = nelems
result_buf = cp.zeros(nelems_output, dtype=dtype)
result_buf = allocate_buffer(nelems_output, dtype=dtype)
if in_place:
if "allgather" in execution_plan_name:
input_buf = cp.split(result_buf, num_ranks)[rank]
else:
input_buf = result_buf
else:
input_buf = cp.zeros(nelems_input, dtype=dtype)
input_buf = allocate_buffer(nelems_input, dtype=dtype)
test_buf = cp.zeros(nelems_output, dtype=dtype)
return input_buf, result_buf, test_buf

View File

@@ -52,6 +52,8 @@ auto getOpType = [](const std::string& str) {
return mscclpp::OperationType::TRANSFORM_TO_PACKET;
} else if (str == "rpkt") {
return mscclpp::OperationType::REDUCE_PACKET;
} else if (str == "glres") {
return mscclpp::OperationType::MULTI_LOAD_REDUCE_STORE;
} else {
throw mscclpp::Error("Invalid operation type", mscclpp::ErrorCode::ExecutorError);
}
@@ -76,11 +78,15 @@ auto convertToChannelType = [](const std::string& str) {
return mscclpp::ChannelType::PROXY;
} else if (str == "none") {
return mscclpp::ChannelType::NONE;
} else if (str == "nvls") {
return mscclpp::ChannelType::NVLS;
} else {
throw mscclpp::Error("Invalid channel type", mscclpp::ErrorCode::ExecutorError);
}
};
std::set groupChannelType{mscclpp::ChannelType::NVLS};
} // namespace
namespace mscclpp {
@@ -100,7 +106,7 @@ std::vector<ChannelInfo> ExecutionPlan::Impl::getChannelInfos(int rank, BufferTy
}
std::vector<ChannelInfo> ExecutionPlan::Impl::getChannelInfosByDstRank(int rank, BufferType bufferType) const {
auto pred = [rank, bufferType](const ChannelInfo& info) { return info.dstBufferType == bufferType; };
auto pred = [bufferType](const ChannelInfo& info) { return info.dstBufferType == bufferType; };
return filter(this->channelInfosByDstRank.at(rank), pred);
}
@@ -126,6 +132,8 @@ std::vector<ChannelInfo> ExecutionPlan::Impl::getUnpairedChannelInfos(int rank,
return unpaired;
}
std::vector<NvlsInfo> ExecutionPlan::Impl::getNvlsInfos(int rank) const { return this->nvlsInfos.at(rank); }
std::vector<int> ExecutionPlan::Impl::getConnectedPeers(int rank) const {
std::set<int> peers;
for (const auto& info : this->channelInfos.at(rank)) {
@@ -181,6 +189,8 @@ void ExecutionPlan::Impl::loadExecutionPlan(size_t inputSize, size_t outputSize,
if (protocol == "LL") {
this->isUsingPacket = true;
}
this->inputSize = inputSize;
this->outputSize = outputSize;
this->nThreadsPerBlock = obj.value("num_threads_per_block", 1024);
const auto& gpus = obj["gpus"];
@@ -192,9 +202,6 @@ void ExecutionPlan::Impl::loadExecutionPlan(size_t inputSize, size_t outputSize,
this->chunkGroups[rank] = gpu["chunkGroups"];
}
this->setupChannels(gpus);
this->inputSize = inputSize;
this->outputSize = outputSize;
this->setupOperations(gpus, contsSrcOffset, constDstOffset);
}
@@ -224,15 +231,24 @@ void ExecutionPlan::Impl::lightLoadExecutionPlan(size_t inputSize, size_t output
this->setupOperations(gpus, contsSrcOffset, constDstOffset);
}
// Construct the channel info. Step 1. Flatten SM and PROXY channels into separate vectors.
// Step 2. For each threadblock, construct a vector of channel indexes and keys.
void ExecutionPlan::Impl::setupChannels(const json& gpus) {
using mapKey = std::tuple<int, BufferType, BufferType, ChannelType>;
std::map<mapKey, std::vector<int>> chanConnectedPeersMap;
for (const auto& gpu : gpus) {
int rank = gpu["id"];
std::vector<ChannelInfo> channelInfos;
for (const auto& channel : gpu["channels"]) {
void ExecutionPlan::Impl::parseChannels(
const json& gpu, std::vector<ChannelInfo>& channelInfos, std::vector<NvlsInfo>& nvlsInfos,
std::map<std::tuple<int, BufferType, BufferType, ChannelType>, std::vector<int>>& chanConnectedPeersMap, int rank) {
for (const auto& channel : gpu["channels"]) {
ChannelType chanType = convertToChannelType(channel["type"]);
if (chanType == ChannelType::NVLS) {
NvlsInfo info;
info.bufferType = convertToBufferType(channel["buff"]);
for (const auto& group : channel["rankGroups"]) {
info.bufferSize = (int)group["size"] * this->getUpperBoundChunkSize(rank, this->inputSize, this->outputSize);
info.ranks.clear();
for (int rank : group["ranks"]) {
info.ranks.push_back(rank);
}
nvlsInfos.push_back(info);
}
} else {
ChannelInfo info;
info.srcBufferType = convertToBufferType(channel["srcbuff"]);
info.dstBufferType = convertToBufferType(channel["dstbuff"]);
@@ -244,7 +260,21 @@ void ExecutionPlan::Impl::setupChannels(const json& gpus) {
}
channelInfos.push_back(info);
}
}
}
// Construct the channel info. Step 1. Flatten SM and PROXY channels into separate vectors.
// Step 2. For each threadblock, construct a vector of channel indexes and keys.
void ExecutionPlan::Impl::setupChannels(const json& gpus) {
using mapKey = std::tuple<int, BufferType, BufferType, ChannelType>;
std::map<mapKey, std::vector<int>> chanConnectedPeersMap;
for (const auto& gpu : gpus) {
int rank = gpu["id"];
std::vector<ChannelInfo> channelInfos;
std::vector<NvlsInfo> nvlsInfos;
this->parseChannels(gpu, channelInfos, nvlsInfos, chanConnectedPeersMap, rank);
this->channelInfos[rank] = channelInfos;
this->nvlsInfos[rank] = nvlsInfos;
}
for (const auto& [key, connectedFrom] : chanConnectedPeersMap) {
@@ -260,21 +290,30 @@ void ExecutionPlan::Impl::setupChannels(const json& gpus) {
// setup threadblockChannelMap
for (const auto& gpu : gpus) {
int rank = gpu["id"];
auto channelTypes = {ChannelType::SM, ChannelType::PROXY};
auto channelTypes = {ChannelType::SM, ChannelType::PROXY, ChannelType::NVLS};
std::unordered_map<ChannelKey, std::vector<int>> channelMap;
for (auto channelType : channelTypes) {
const std::vector<ChannelInfo> channelInfos = this->getChannelInfos(rank, channelType);
int index = 0;
for (const auto& info : channelInfos) {
ChannelKey key = {info.srcBufferType, info.dstBufferType, info.channelType};
for (size_t i = 0; i < info.connectedPeers.size(); i++) {
if (channelType == ChannelType::NVLS) {
const std::vector<NvlsInfo> nvlsInfos = this->getNvlsInfos(rank);
for (const auto& info : nvlsInfos) {
ChannelKey key = {info.bufferType, info.bufferType, ChannelType::NVLS};
channelMap[key].push_back(index++);
}
} else {
for (const auto& info : channelInfos) {
ChannelKey key = {info.srcBufferType, info.dstBufferType, info.channelType};
for (size_t i = 0; i < info.connectedPeers.size(); i++) {
channelMap[key].push_back(index++);
}
}
}
}
int nthreadblocks = gpu["threadblocks"].size();
this->threadblockSMChannelMap[rank].resize(nthreadblocks);
this->threadblockProxyChannelMap[rank].resize(nthreadblocks);
this->threadblockNvlsChannelMap[rank].resize(nthreadblocks);
for (const auto& threadblock : gpu["threadblocks"]) {
for (const auto& channel : threadblock["channels"]) {
ChannelType channelType = convertToChannelType(channel["ctype"]);
@@ -284,6 +323,8 @@ void ExecutionPlan::Impl::setupChannels(const json& gpus) {
this->threadblockSMChannelMap[rank][threadblock["id"]].emplace_back(channelMap[key][id], key);
} else if (channelType == ChannelType::PROXY) {
this->threadblockProxyChannelMap[rank][threadblock["id"]].emplace_back(channelMap[key][id], key);
} else if (channelType == ChannelType::NVLS) {
this->threadblockNvlsChannelMap[rank][threadblock["id"]].emplace_back(channelMap[key][id], key);
}
}
}
@@ -314,6 +355,7 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus, size_t constSrcOffse
int threadblockId = threadblock["id"];
const auto& smChannels = this->threadblockSMChannelMap[rank][threadblockId];
const auto& proxyChannels = this->threadblockProxyChannelMap[rank][threadblockId];
const auto& nvlsChannels = this->threadblockNvlsChannelMap[rank][threadblockId];
for (size_t i = 0; i < smChannels.size(); i++) {
const auto& [_, key] = smChannels[i];
channelIndexes[key].push_back(i);
@@ -322,6 +364,10 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus, size_t constSrcOffse
const auto& [_, key] = proxyChannels[i];
channelIndexes[key].push_back(i);
}
for (size_t i = 0; i < nvlsChannels.size(); i++) {
const auto& [_, key] = nvlsChannels[i];
channelIndexes[key].push_back(i);
}
for (const auto& op : threadblock["ops"]) {
Operation operation = {};
std::vector<uint32_t> chunkIndexes;
@@ -330,17 +376,24 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus, size_t constSrcOffse
operation.channelType = convertToChannelType(op["ctype"]);
}
if (op.contains("i_cids")) {
operation.nInputs = op["i_cids"].size();
for (int i = 0; i < operation.nInputs; i++) {
BufferType srcBufferType = convertToBufferType(op["i_buff"]["src"]);
BufferType dstBufferType = convertToBufferType(op["i_buff"]["dst"]);
// Get the relevant channel index in rank channelInfos
operation.inputChannelIndexes[i] =
channelIndexes[{srcBufferType, dstBufferType, operation.channelType}][op["i_cids"][i]["id"]];
operation.inputOffsets[i] =
this->getOffset(rank, this->inputSize, this->outputSize, (uint32_t)op["i_cids"][i]["off"]) +
getConstOffset(srcBufferType);
chunkIndexes.push_back((uint32_t)op["i_cids"][i]["off"]);
if (operation.channelType == mscclpp::ChannelType::NVLS) {
BufferType srcBufferType = convertToBufferType(op["srcbuff"]);
operation.nvlsInputIndex =
channelIndexes[{srcBufferType, srcBufferType, ChannelType::NVLS}][op["i_cids"][0]["id"]];
chunkIndexes.push_back((uint32_t)op["srcoff"]);
} else {
operation.nInputs = op["i_cids"].size();
for (int i = 0; i < operation.nInputs; i++) {
BufferType srcBufferType = convertToBufferType(op["i_buff"]["src"]);
BufferType dstBufferType = convertToBufferType(op["i_buff"]["dst"]);
// Get the relevant channel index in rank channelInfos
operation.inputChannelIndexes[i] =
channelIndexes[{srcBufferType, dstBufferType, operation.channelType}][op["i_cids"][i]["id"]];
operation.inputOffsets[i] =
this->getOffset(rank, this->inputSize, this->outputSize, (uint32_t)op["i_cids"][i]["off"]) +
getConstOffset(srcBufferType);
chunkIndexes.push_back((uint32_t)op["i_cids"][i]["off"]);
}
}
}
// will have either srcs or i_cids
@@ -357,14 +410,21 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus, size_t constSrcOffse
if (op.contains("o_cids")) {
operation.nOutputs = op["o_cids"].size();
for (int i = 0; i < operation.nOutputs; i++) {
BufferType srcBufferType = convertToBufferType(op["o_buff"]["src"]);
BufferType dstBufferType = convertToBufferType(op["o_buff"]["dst"]);
operation.outputChannelIndexes[i] =
channelIndexes[{srcBufferType, dstBufferType, operation.channelType}][op["o_cids"][i]["id"]];
operation.outputOffsets[i] =
this->getOffset(rank, this->inputSize, this->outputSize, (uint32_t)op["o_cids"][i]["off"]) +
getConstOffset(dstBufferType);
chunkIndexes.push_back((uint32_t)op["o_cids"][i]["off"]);
if (operation.channelType == mscclpp::ChannelType::NVLS) {
BufferType dstBufferType = convertToBufferType(op["dstbuff"]);
operation.nvlsOutputIndex =
channelIndexes[{dstBufferType, dstBufferType, ChannelType::NVLS}][op["o_cids"][0]["id"]];
chunkIndexes.push_back((uint32_t)op["dstoff"]);
} else {
BufferType srcBufferType = convertToBufferType(op["o_buff"]["src"]);
BufferType dstBufferType = convertToBufferType(op["o_buff"]["dst"]);
operation.outputChannelIndexes[i] =
channelIndexes[{srcBufferType, dstBufferType, operation.channelType}][op["o_cids"][i]["id"]];
operation.outputOffsets[i] =
this->getOffset(rank, this->inputSize, this->outputSize, (uint32_t)op["o_cids"][i]["off"]) +
getConstOffset(dstBufferType);
chunkIndexes.push_back((uint32_t)op["o_cids"][i]["off"]);
}
}
}
// will have either dsts or o_cids
@@ -460,11 +520,19 @@ size_t ExecutionPlan::Impl::getNChunkSize(int rank, size_t inputSize, size_t out
return nChunkSize;
}
size_t ExecutionPlan::Impl::getUpperBoundChunkSize(int rank, size_t inputSize, size_t outputSize) const {
auto sizePerRank = calcSizePerRank(rank, inputSize, outputSize);
uint32_t nChunks = sizePerRank.second;
return (sizePerRank.first + nChunks - 1) / nChunks;
}
void ExecutionPlan::Impl::reset() {
this->operations.clear();
this->channelInfos.clear();
this->nvlsInfos.clear();
this->threadblockSMChannelMap.clear();
this->threadblockProxyChannelMap.clear();
this->threadblockNvlsChannelMap.clear();
this->inputChunks.clear();
this->outputChunks.clear();
this->scratchChunks.clear();
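
In plan-JSON terms (field names as parsed in this file): an NVLS channel entry carries `"type": "nvls"`, a `"buff"` buffer type, and `"rankGroups"` objects with `"size"` and `"ranks"`, while ops that use it reference a single `"i_cids"`/`"o_cids"` entry plus `"srcbuff"`/`"srcoff"` or `"dstbuff"`/`"dstoff"` to locate the chunk.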

View File

@@ -2,6 +2,7 @@
// Licensed under the MIT license.
#include <mscclpp/executor.hpp>
#include <mscclpp/nvls.hpp>
#include <mscclpp/proxy_channel.hpp>
#include <mscclpp/sm_channel.hpp>
#include <set>
@@ -23,6 +24,19 @@ struct ExecutionContextKey {
}
};
void* getBuffer(BufferType type, void* sendbuff, void* recvbuff, void* scratch) {
switch (type) {
case BufferType::INPUT:
return sendbuff;
case BufferType::OUTPUT:
return recvbuff;
case BufferType::SCRATCH:
return scratch;
default:
throw Error("Invalid buffer type", ErrorCode::ExecutorError);
}
};
struct DeviceExecutionPlanKey {
size_t inputMessageSize;
size_t outputMessageSize;
@@ -97,11 +111,13 @@ namespace mscclpp {
struct ExecutionContext {
std::shared_ptr<ProxyService> proxyService;
std::unordered_map<int, std::shared_ptr<Connection>> connections;
std::vector<std::shared_ptr<NvlsConnection>> nvlsConnections;
std::unordered_map<std::pair<BufferType, int>, mscclpp::RegisteredMemory> registeredMemories;
std::vector<std::shared_ptr<mscclpp::SmDevice2DeviceSemaphore>> smSemaphores;
std::vector<mscclpp::SemaphoreId> proxySemaphores;
std::vector<mscclpp::SmChannel> smChannels;
std::vector<mscclpp::SimpleProxyChannel> proxyChannels;
std::vector<mscclpp::NvlsConnection::DeviceMulticastPointer> nvlsChannels;
std::unordered_map<DeviceExecutionPlanKey, std::vector<DeviceExecutionPlan>> deviceExecutionPlans;
std::unordered_map<DeviceExecutionPlanKey, std::shared_ptr<char>> deviceExecutionPlansBuffers;
std::shared_ptr<char> scratchBuffer;
@@ -152,7 +168,12 @@ struct Executor::Impl {
ExecutionContext context;
size_t scratchBufferSize = plan.impl_->getScratchBufferSize(rank, sendBufferSize, recvBufferSize);
std::shared_ptr<char> scratchBuffer = allocExtSharedCuda<char>(scratchBufferSize);
std::shared_ptr<char> scratchBuffer;
if (isNvlsSupported()) {
scratchBuffer = allocSharedPhysicalCuda<char>(scratchBufferSize);
} else {
scratchBuffer = allocExtSharedCuda<char>(scratchBufferSize);
}
context.scratchBuffer = scratchBuffer;
context.scratchBufferSize = scratchBufferSize;
context.proxyService = std::make_shared<ProxyService>();
@@ -160,6 +181,7 @@ struct Executor::Impl {
this->setupConnections(context, rank, plan);
this->setupRegisteredMemories(context, sendbuff, recvbuff, sendBufferSize, recvBufferSize, rank, plan);
this->setupChannels(context, sendbuff, recvbuff, sendBufferSize, recvBufferSize, rank, plan);
this->setupNvlsChannels(context, sendbuff, recvbuff, rank, plan);
this->setupDeviceExecutionPlan(context, devicePlanKey, rank, plan);
context.deviceExecutionPlansBuffers[devicePlanKey] =
allocExtSharedCuda<char>(context.deviceExecutionPlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan));
@@ -202,6 +224,13 @@ struct Executor::Impl {
for (size_t i = 0; i < connectionFutures.size(); i++) {
context.connections[connectedPeers[i]] = connectionFutures[i].get();
}
std::vector<NvlsInfo> nvlsInfos = plan.impl_->getNvlsInfos(rank);
for (const NvlsInfo& info : nvlsInfos) {
std::shared_ptr<NvlsConnection> nvlsConnection =
mscclpp::connectNvlsCollective(this->comm, info.ranks, info.bufferSize);
context.nvlsConnections.push_back(nvlsConnection);
}
}
void setupRegisteredMemories(ExecutionContext& context, void* sendbuff, void* recvbuff, size_t sendBufferSize,
@@ -284,18 +313,6 @@ struct Executor::Impl {
context.smSemaphores = std::move(smSemaphores);
context.proxySemaphores = std::move(proxySemaphores);
auto getBuffer = [&](BufferType type) {
switch (type) {
case BufferType::INPUT:
return sendbuff;
case BufferType::OUTPUT:
return recvbuff;
case BufferType::SCRATCH:
return (void*)context.scratchBuffer.get();
default:
throw Error("Invalid buffer type", ErrorCode::ExecutorError);
}
};
auto getBufferSize = [&](BufferType type) {
switch (type) {
case BufferType::INPUT:
@@ -313,7 +330,7 @@ struct Executor::Impl {
std::vector<ChannelInfo> channelInfos = plan.impl_->getChannelInfos(rank, channelType);
int index = 0;
for (ChannelInfo& info : channelInfos) {
void* src = getBuffer(info.srcBufferType);
void* src = getBuffer(info.srcBufferType, sendbuff, recvbuff, context.scratchBuffer.get());
size_t bufferSize = getBufferSize(info.srcBufferType);
TransportFlags transport = getTransportFlags(channelInfos, rank);
RegisteredMemory localMemory = this->comm->registerMemory(src, bufferSize, transport);
@@ -332,6 +349,19 @@ struct Executor::Impl {
}
}
void setupNvlsChannels(ExecutionContext& context, void* sendbuff, void* recvbuff, int rank,
const ExecutionPlan& plan) {
std::vector<NvlsInfo> nvlsInfos = plan.impl_->getNvlsInfos(rank);
for (size_t i = 0; i < nvlsInfos.size(); i++) {
std::shared_ptr<NvlsConnection> nvlsConnection = context.nvlsConnections[i];
NvlsInfo info = nvlsInfos[i];
void* buffer = getBuffer(info.bufferType, sendbuff, recvbuff, context.scratchBuffer.get());
NvlsConnection::DeviceMulticastPointer deviceMulticastPointer =
nvlsConnection->bindAllocatedMemory((CUdeviceptr)buffer, info.bufferSize);
context.nvlsChannels.push_back(deviceMulticastPointer);
}
}
void setupDeviceExecutionPlan(ExecutionContext& context, const DeviceExecutionPlanKey& key, int rank,
const ExecutionPlan& plan) {
std::vector<DeviceExecutionPlan> deviceExecutionPlans;
@@ -349,6 +379,10 @@ struct Executor::Impl {
for (const auto& [index, _] : plan.impl_->threadblockProxyChannelMap.at(rank).at(threadblock)) {
deviceExecutionPlan.channels.proxyChannels[chanIndex++] = mscclpp::deviceHandle(context.proxyChannels[index]);
}
chanIndex = 0;
for (const auto& [index, _] : plan.impl_->threadblockNvlsChannelMap.at(rank).at(threadblock)) {
deviceExecutionPlan.channels.nvlsChannels[chanIndex++] = mscclpp::deviceHandle(context.nvlsChannels[index]);
}
for (size_t i = 0; i < ops.size(); i++) {
deviceExecutionPlan.operations[i] = ops[i];
}
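
Note the setup ordering here: `setupConnections` creates one `NvlsConnection` per rank group listed in the plan, and `setupNvlsChannels` later binds sendbuff, recvbuff, or the scratch buffer to them. That is why the scratch buffer switches to `allocSharedPhysicalCuda` when NVLS is available: memory from `allocExtSharedCuda` is not created through `cuMemCreate` and so cannot be bound to a multicast group.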

View File

@@ -4,6 +4,7 @@
#ifndef MSCCLPP_EXECUTION_COMMON_HPP_
#define MSCCLPP_EXECUTION_COMMON_HPP_
#include <mscclpp/nvls.hpp>
#include <mscclpp/proxy_channel.hpp>
#include <mscclpp/sm_channel.hpp>
@@ -24,6 +25,7 @@ enum class ChannelType : uint8_t {
NONE,
SM,
PROXY,
NVLS,
};
// NOTE(chhwang): any modification here requires corresponding updates in `tools/npkit/npkit_trace_generator.py`.
@@ -46,11 +48,13 @@ enum class OperationType : uint8_t {
REDUCE_SEND_PACKET,
READ_REDUCE_COPY,
READ_REDUCE_COPY_SEND,
MULTI_LOAD_REDUCE_STORE,
};
struct Channels {
mscclpp::DeviceHandle<mscclpp::SmChannel> smChannels[MAX_CHANNEL];
mscclpp::DeviceHandle<mscclpp::SimpleProxyChannel> proxyChannels[MAX_CHANNEL];
mscclpp::DeviceHandle<mscclpp::NvlsConnection::DeviceMulticastPointer> nvlsChannels[MAX_CHANNEL];
};
struct Operation {
@@ -61,12 +65,18 @@ struct Operation {
uint8_t nInputs;
uint8_t nOutputs;
union {
// For ops which require reading from multiple remote sources
uint8_t inputChannelIndexes[MAX_CHANNEL_PER_OPERATION];
// For ops which require reading from multiple local sources
BufferType inputBufferType;
uint8_t nvlsInputIndex;
};
union {
// For ops which require writing to multiple remote destinations
uint8_t outputChannelIndexes[MAX_CHANNEL_PER_OPERATION];
// For ops which require writing to multiple local destinations
BufferType outputBufferType;
uint8_t nvlsOutputIndex;
};
uint32_t inputOffsets[MAX_CHANNEL_PER_OPERATION];
uint32_t outputOffsets[MAX_CHANNEL_PER_OPERATION];
@@ -75,12 +85,12 @@ struct Operation {
uint32_t size;
};
// total size = 1920 + 6400 + 4 + 12(padding) = 8336 bytes
// total size = 2304 + 6400 + 4 + 12(padding) = 8720 bytes
struct __attribute__((aligned(16))) DeviceExecutionPlan {
uint8_t nSmChannels; // 1 byte
uint8_t nProxyChannels; // 1 byte
uint16_t nOperations; // 2 bytes
Channels channels; // 1920 bytes
Channels channels; // 2304 bytes
Operation operations[MAX_OPERATION]; // 64 * 100 = 6400 bytes
};
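
The new `nvlsInputIndex`/`nvlsOutputIndex` fields sit inside the existing unions with the channel-index arrays, so `Operation` itself does not grow for non-NVLS ops; the size increase of `DeviceExecutionPlan` (8336 to 8720 bytes) comes entirely from the added `nvlsChannels` array in `Channels` (2304 vs. 1920 bytes).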

View File

@@ -15,7 +15,8 @@
#include "execution_common.hpp"
#if defined(MSCCLPP_DEVICE_COMPILE)
#include "gpu_data_types.hpp"
#include <mscclpp/gpu_data_types.hpp>
#include <mscclpp/nvls_device.hpp>
namespace {
template <typename To, typename From>
@@ -138,6 +139,34 @@ MSCCLPP_DEVICE_INLINE uint32_t add_vectors<__bfloat16>(uint32_t a, uint32_t b) {
return add_vectors_helper<__bfloat162>(a, b);
}
template <typename T>
struct VectorType {
using type = T;
using nvls_type = T;
using nvls_type2 = T;
};
template <>
struct VectorType<__half> {
using type = __half2;
using nvls_type = uint4;
using nvls_type2 = uint1;
};
template <>
struct VectorType<__bfloat16> {
using type = __bfloat162;
using nvls_type = uint4;
using nvls_type2 = uint1;
};
template <>
struct VectorType<float> {
using type = float;
using nvls_type = uint4;
using nvls_type2 = uint1;
};
} // namespace
#endif // defined(MSCCLPP_DEVICE_COMPILE)
@@ -401,6 +430,37 @@ MSCCLPP_DEVICE_INLINE void handleCopy(void* dst, void* src, uint32_t dstOffset,
Element::copy(dstData, srcData, size, threadIdx.x, blockDim.x);
}
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
template <typename T>
MSCCLPP_DEVICE_INLINE void handleMultiLoadReduceStore(T* dst, T* src, uint32_t dstOffset, uint32_t srcOffset,
size_t size) {
using vectorType = typename VectorType<T>::type;
using nvlsType = typename VectorType<T>::nvls_type;
// NVLS can only handle 4-byte alignment
assert(size % sizeof(vectorType) == 0);
const size_t nInt4 = size / sizeof(nvlsType);
const size_t srcOffset4 = srcOffset / sizeof(nvlsType);
const size_t dstOffset4 = dstOffset / sizeof(nvlsType);
nvlsType* src4 = (nvlsType*)src;
nvlsType* dst4 = (nvlsType*)dst;
for (size_t idx = threadIdx.x; idx < nInt4; idx += blockDim.x) {
nvlsType val;
DeviceMulticastPointerDeviceHandle::multimemLoadReduce(val, (vectorType*)(src4 + srcOffset4 + idx));
DeviceMulticastPointerDeviceHandle::multimemStore(val, (vectorType*)(dst4 + dstOffset4 + idx));
}
// handle rest of data
size_t processed = nInt4 * sizeof(nvlsType);
using nvlsType2 = typename VectorType<T>::nvls_type2;
const size_t startIdx = (srcOffset + processed) / sizeof(nvlsType2);
const size_t endIdx = (dstOffset + size) / sizeof(nvlsType2);
for (size_t idx = threadIdx.x + startIdx; idx < endIdx; idx += blockDim.x) {
nvlsType2 val;
DeviceMulticastPointerDeviceHandle::multimemLoadReduce(val, (vectorType*)src + idx);
DeviceMulticastPointerDeviceHandle::multimemStore(val, (vectorType*)dst + idx);
}
}
#endif
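
A quick check of the two loops with assumed numbers: for `T = __half` (so `nvls_type = uint4`, `nvls_type2 = uint1`) and `size = 200` bytes, the vector loop runs `200 / 16 = 12` iterations covering 192 bytes, and the 4-byte tail loop finishes the remaining 8 bytes in two steps. Note that the tail loop indexes `src` and `dst` with the same element index, so it effectively assumes matching source and destination offsets.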
template <typename T, typename PacketType = LL16Packet>
__global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* input, T* output, T* scratch,
size_t scratchSize, DeviceExecutionPlan* plan, uint32_t flag
@@ -433,6 +493,8 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu
Operation* operations = localPlan->operations;
DeviceHandle<SmChannel>* smChannels = localPlan->channels.smChannels;
DeviceHandle<SimpleProxyChannel>* proxyChannels = localPlan->channels.proxyChannels;
[[maybe_unused]] DeviceHandle<NvlsConnection::DeviceMulticastPointer>* nvlsChannels =
localPlan->channels.nvlsChannels;
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
#if defined(MSCCLPP_DEVICE_HIP)
@@ -530,6 +592,13 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu
handleReduceSend(dst, op.dstOffset, src, op.srcOffset, tmp, op.inputOffsets, smChannels, op.outputChannelIndexes,
op.outputOffsets, op.nOutputs, op.size);
}
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
else if (op.type == OperationType::MULTI_LOAD_REDUCE_STORE) {
T* dst = (T*)(nvlsChannels[op.nvlsOutputIndex].mcPtr);
T* src = (T*)(nvlsChannels[op.nvlsInputIndex].mcPtr);
handleMultiLoadReduceStore(dst, src, op.dstOffset, op.srcOffset, op.size);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT)
NpKit::CollectGpuEventShm(NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT + (int)op.type, op.size, 0, NPKIT_GET_GPU_TIMESTAMP(),

View File

@@ -23,6 +23,12 @@ struct ChannelKey {
channelType == other.channelType;
}
};
struct NvlsInfo {
std::vector<int> ranks;
size_t bufferSize;
BufferType bufferType;
};
} // namespace mscclpp
namespace std {
@@ -63,6 +69,7 @@ struct ExecutionPlan::Impl {
std::vector<ChannelInfo> getChannelInfos(int rank, BufferType bufferType) const;
std::vector<ChannelInfo> getChannelInfosByDstRank(int rank, BufferType bufferType) const;
std::vector<ChannelInfo> getUnpairedChannelInfos(int rank, int worldSize, ChannelType channelType);
std::vector<NvlsInfo> getNvlsInfos(int rank) const;
std::vector<int> getConnectedPeers(int rank) const;
std::vector<BufferType> getConnectedBufferTypes(int rank) const;
size_t getScratchBufferSize(int rank, size_t inputSize, size_t outputSize) const;
@@ -86,9 +93,12 @@ struct ExecutionPlan::Impl {
std::unordered_map<int, std::vector<ChannelInfo>> channelInfos;
std::unordered_map<int, std::vector<ChannelInfo>> channelInfosByDstRank;
std::unordered_map<std::pair<int, ChannelType>, std::unordered_map<int, int>> channelCountMap;
// for nvls channels
std::unordered_map<int, std::vector<NvlsInfo>> nvlsInfos;
// threadblockChannelMap[rank][threadblock] = [channelIndex, channelKey]
std::unordered_map<int, std::vector<std::vector<std::pair<int, ChannelKey>>>> threadblockSMChannelMap;
std::unordered_map<int, std::vector<std::vector<std::pair<int, ChannelKey>>>> threadblockProxyChannelMap;
std::unordered_map<int, std::vector<std::vector<std::pair<int, ChannelKey>>>> threadblockNvlsChannelMap;
std::unordered_map<int, uint32_t> inputChunks;
std::unordered_map<int, uint32_t> outputChunks;
std::unordered_map<int, uint32_t> scratchChunks;
@@ -102,6 +112,13 @@ struct ExecutionPlan::Impl {
size_t getOffset(int rank, size_t inputSize, size_t outputSize, uint32_t chunkIndex, uint32_t alignment = 16) const;
size_t getNChunkSize(int rank, size_t inputSize, size_t outputSize, uint32_t nChunks,
const std::vector<uint32_t> offsets) const;
size_t getUpperBoundChunkSize(int rank, size_t inputSize, size_t outputSize) const;
// helper functions to setup the channels
void parseChannels(
const nlohmann::json& gpu, std::vector<ChannelInfo>& channelInfos, std::vector<NvlsInfo>& nvlsInfos,
std::map<std::tuple<int, BufferType, BufferType, ChannelType>, std::vector<int>>& chanConnectedPeersMap,
int rank);
};
} // namespace mscclpp

View File

@@ -27,6 +27,10 @@ struct TransportInfo {
const IbMr* ibMr;
IbMrInfo ibMrInfo;
};
struct {
char shareableHandle[64];
size_t offsetFromBase;
};
};
};
@@ -39,6 +43,7 @@ struct RegisteredMemory::Impl {
size_t size;
uint64_t hostHash;
uint64_t pidHash;
bool isCuMemMapAlloc;
TransportFlags transports;
std::vector<TransportInfo> transportInfos;

View File

@@ -15,7 +15,7 @@
namespace mscclpp {
#if (USE_NVLS)
#if (CUDA_NVLS_SUPPORTED)
class NvlsConnection::Impl : public std::enable_shared_from_this<NvlsConnection::Impl> {
public:
// use this only for the root of the NVLS
@@ -31,10 +31,11 @@ class NvlsConnection::Impl : public std::enable_shared_from_this<NvlsConnection:
void addDevice(int cudaDeviceId);
size_t allocateBuffer(size_t size);
void freeBuffer(size_t offset, size_t size) noexcept;
std::shared_ptr<char> bindMemory(CUmemGenericAllocationHandle memHandle, size_t devBuffSize);
std::shared_ptr<char> bindMemory(CUdeviceptr devicePtr, size_t devBuffSize);
private:
friend class NvlsConnection;
CUmemGenericAllocationHandle mcHandle_;
CUmulticastObjectProp mcProp_;
size_t bufferSize_;
@@ -70,8 +71,10 @@ NvlsConnection::Impl::Impl(size_t bufferSize, int numDevices) {
throw mscclpp::SysError("getpid() failed", errno);
}
INFO(MSCCLPP_COLL, "NVLS handle created on root with size %ld. minGranularity %ld and recommendedGranularity %ld\n",
mcProp_.size, minMcGran_, mcGran_);
INFO(MSCCLPP_COLL,
"NVLS handle created on root with size %ld, minGranularity %ld, recommendedGranularity %ld; buffer size is "
"%ld, adjusted size is %ld",
mcProp_.size, minMcGran_, mcGran_, bufferSize, bufferSize_);
}
NvlsConnection::Impl::Impl(const std::vector<char>& data) {
@@ -128,6 +131,8 @@ void NvlsConnection::Impl::addDevice(int cudaDeviceId) {
INFO(MSCCLPP_COLL, "NVLS connection created");
}
// TODO(binyli): For cuMemMap, we cannot map a handle to a VA at a non-zero offset.
// Then we wouldn't need to maintain the freeRanges_ list; different memory could map to different mc handles.
size_t NvlsConnection::Impl::allocateBuffer(size_t size) {
if (freeRanges_.empty()) {
throw Error("This NVLS connection mapped more than it was supposed to", ErrorCode::InvalidUsage);
@@ -187,24 +192,21 @@ void NvlsConnection::Impl::freeBuffer(size_t offset, size_t size) noexcept {
}
}
std::shared_ptr<char> NvlsConnection::Impl::bindMemory(CUmemGenericAllocationHandle memHandle, size_t devBuffSize) {
std::shared_ptr<char> NvlsConnection::Impl::bindMemory(CUdeviceptr devicePtr, size_t devBuffSize) {
devBuffSize = ((devBuffSize + minMcGran_ - 1) / minMcGran_) * minMcGran_;
size_t offset = allocateBuffer(devBuffSize);
MSCCLPP_CUTHROW(cuMulticastBindMem(mcHandle_, offset /*mcOffset*/, memHandle, 0 /*memOffset*/, devBuffSize, 0));
MSCCLPP_CUTHROW(cuMulticastBindAddr(mcHandle_, offset /*mcOffset*/, devicePtr, devBuffSize, 0));
char* mcPtr;
CUmemAccessDesc accessDesc = {};
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
int deviceId = -1;
MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));
accessDesc.location.id = deviceId;
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)(&mcPtr), devBuffSize, minMcGran_, 0U, 0));
MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)(mcPtr), devBuffSize, 0, mcHandle_, 0));
MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)(mcPtr), devBuffSize, &accessDesc, 1));
setReadWriteMemoryAccess(mcPtr, devBuffSize);
INFO(MSCCLPP_COLL, "NVLS connection bound memory at offset %ld, size %ld", offset, devBuffSize);
auto deleter = [=, self = shared_from_this()](char* ptr) {
int deviceId;
CUdevice device;
MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));
MSCCLPP_CUTHROW(cuDeviceGet(&device, deviceId));
MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, devBuffSize));
MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, devBuffSize));
@@ -214,7 +216,8 @@ std::shared_ptr<char> NvlsConnection::Impl::bindMemory(CUmemGenericAllocationHan
return std::shared_ptr<char>(mcPtr, deleter);
}
#else // !(USE_NVLS)
#else // !(CUDA_NVLS_SUPPORTED)
class NvlsConnection::Impl {
public:
// use this only for the root of the NVLS
@@ -227,15 +230,15 @@ class NvlsConnection::Impl {
std::vector<char> serialize() { throw notSupportedError; }
size_t allocateBuffer(size_t) { throw notSupportedError; }
void freeBuffer(size_t, size_t) { throw notSupportedError; }
std::shared_ptr<char> bindMemory(CUmemGenericAllocationHandle, size_t) { throw notSupportedError; }
std::shared_ptr<char> bindMemory(CUdeviceptr, size_t) { throw notSupportedError; }
void addDevice(int) { throw notSupportedError; }
size_t getMinMcGran() { throw notSupportedError; }
private:
Error notSupportedError =
Error("NVLS is not supported on this CUDA version (< 12.1) or kernel version (< 5.6.0)", ErrorCode::InvalidUsage);
Error("NVLS is not supported on this CUDA version (< 12.3) or kernel version (< 5.6.0)", ErrorCode::InvalidUsage);
};
#endif // !(USE_NVLS)
#endif // !(CUDA_NVLS_SUPPORTED)
const int NvlsConnection::DefaultNvlsBufferSize = (1 << 29);
@@ -254,25 +257,20 @@ NvlsConnection::NvlsConnection(const std::vector<char>& data) : pimpl_(std::make
std::vector<char> NvlsConnection::serialize() { return pimpl_->serialize(); }
std::shared_ptr<NvlsConnection::DeviceMulticastPointer> NvlsConnection::allocateAndBindCuda(size_t size) {
auto mem = allocSharedPhysicalCuda<char>(size, pimpl_->getMinMcGran());
auto mcPtr = pimpl_->bindMemory(mem->memHandle_, size);
return std::make_shared<DeviceMulticastPointer>(mem, mcPtr, size);
}
std::shared_ptr<char> NvlsConnection::bindAllocatedCuda(CUmemGenericAllocationHandle memHandle, size_t size) {
return pimpl_->bindMemory(memHandle, size);
NvlsConnection::DeviceMulticastPointer NvlsConnection::bindAllocatedMemory(CUdeviceptr devicePtr, size_t size) {
auto mcPtr = pimpl_->bindMemory(devicePtr, size);
return DeviceMulticastPointer((void*)devicePtr, mcPtr, size);
}
NvlsConnection::DeviceMulticastPointer::DeviceHandle NvlsConnection::DeviceMulticastPointer::deviceHandle() {
NvlsConnection::DeviceMulticastPointer::DeviceHandle device;
device.devicePtr = this->deviceMem_->devicePtr_;
device.devicePtr = this->devicePtr_;
device.mcPtr = this->mcPtr_.get();
device.bufferSize = this->bufferSize_;
return device;
};
char* NvlsConnection::DeviceMulticastPointer::getDevicePtr() { return deviceMem_->devicePtr_; };
void* NvlsConnection::DeviceMulticastPointer::getDevicePtr() { return devicePtr_; };
size_t NvlsConnection::getMultiCastMinGranularity() { return pimpl_->getMinMcGran(); }

View File

@@ -11,6 +11,65 @@
#include "debug.h"
#include "utils_internal.hpp"
#define MSCCLPP_CULOG_WARN(cmd) \
do { \
CUresult err = cmd; \
if (err != CUDA_SUCCESS) { \
const char* errStr; \
if (cuGetErrorString(err, &errStr) != CUDA_SUCCESS) { \
errStr = "failed to get error string"; \
} \
WARN("Call to " #cmd " failed, error is %s", errStr); \
} \
} while (false)
namespace {
// Get the recommended granularity for cuMemAddressReserve
size_t getRecommendedGranularity() {
#if (CUDA_NVLS_SUPPORTED)
size_t gran = 0;
int deviceId = -1;
int currentDevice = -1;
MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));
MSCCLPP_CUTHROW(cuDeviceGet(&currentDevice, deviceId));
CUmemAllocationProp prop = {};
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
prop.requestedHandleTypes =
(CUmemAllocationHandleType)(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR | CU_MEM_HANDLE_TYPE_FABRIC);
prop.location.id = currentDevice;
MSCCLPP_CUTHROW(cuMemGetAllocationGranularity(&gran, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
return gran;
#else
throw mscclpp::Error("Only support GPU with NVLS support", mscclpp::ErrorCode::InvalidUsage);
#endif
}
CUmemAllocationHandleType getNvlsCompatibleMemHandleType() {
#if (CUDA_NVLS_SUPPORTED)
return CU_MEM_HANDLE_TYPE_FABRIC;
#else
throw mscclpp::Error("Only support GPU with NVLS support", mscclpp::ErrorCode::InvalidUsage);
#endif
}
// Check if ptr is allocated by cuMemMap
bool isCuMemMapAllocated(void* ptr) {
CUmemGenericAllocationHandle handle;
CUresult result = cuMemRetainAllocationHandle(&handle, ptr);
if (result != CUDA_SUCCESS) {
return false;
}
MSCCLPP_CUTHROW(cuMemRelease(handle));
if (!mscclpp::isNvlsSupported()) {
throw mscclpp::Error("cuMemMap is used in env without NVLS support", mscclpp::ErrorCode::InvalidUsage);
}
return true;
}
} // namespace
namespace mscclpp {
RegisteredMemory::Impl::Impl(void* data, size_t size, TransportFlags transports, Context::Impl& contextImpl)
@@ -23,15 +82,24 @@ RegisteredMemory::Impl::Impl(void* data, size_t size, TransportFlags transports,
if (transports.has(Transport::CudaIpc)) {
TransportInfo transportInfo;
transportInfo.transport = Transport::CudaIpc;
cudaIpcMemHandle_t handle;
void* baseDataPtr;
size_t baseDataSize; // dummy
MSCCLPP_CUTHROW(cuMemGetAddressRange((CUdeviceptr*)&baseDataPtr, &baseDataSize, (CUdeviceptr)data));
MSCCLPP_CUDATHROW(cudaIpcGetMemHandle(&handle, baseDataPtr));
// TODO: bug with offset of base?
transportInfo.cudaIpcBaseHandle = handle;
transportInfo.cudaIpcOffsetFromBase = (char*)data - (char*)baseDataPtr;
this->isCuMemMapAlloc = isCuMemMapAllocated(baseDataPtr);
if (this->isCuMemMapAlloc) {
CUmemGenericAllocationHandle handle;
MSCCLPP_CUTHROW(cuMemRetainAllocationHandle(&handle, baseDataPtr));
MSCCLPP_CUTHROW(
cuMemExportToShareableHandle(transportInfo.shareableHandle, handle, getNvlsCompatibleMemHandleType(), 0));
transportInfo.offsetFromBase = (char*)data - (char*)baseDataPtr;
} else {
cudaIpcMemHandle_t handle;
MSCCLPP_CUDATHROW(cudaIpcGetMemHandle(&handle, baseDataPtr));
// TODO: bug with offset of base?
transportInfo.cudaIpcBaseHandle = handle;
transportInfo.cudaIpcOffsetFromBase = (char*)data - (char*)baseDataPtr;
}
this->transportInfos.push_back(transportInfo);
}
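For cuMemMap-backed buffers the constructor exports a fabric handle instead of a cudaIpc handle; unlike POSIX file descriptors, a CU_MEM_HANDLE_TYPE_FABRIC handle is plain data that can be serialized and sent over a socket. A condensed sketch of the export step, with illustrative variable names:

```
// Export the allocation behind `ptr` as a fabric handle. CUmemFabricHandle is
// a fixed-size struct (CUDA 12.3+), so it can travel through the bootstrap
// channel like any other bytes.
CUmemFabricHandle fabricHandle;
CUmemGenericAllocationHandle alloc;
MSCCLPP_CUTHROW(cuMemRetainAllocationHandle(&alloc, ptr));
MSCCLPP_CUTHROW(cuMemExportToShareableHandle(&fabricHandle, alloc, CU_MEM_HANDLE_TYPE_FABRIC, 0));
MSCCLPP_CUTHROW(cuMemRelease(alloc));  // balance the retain
```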
if ((transports & AllIBTransports).any()) {
@@ -75,6 +143,8 @@ MSCCLPP_API_CPP std::vector<char> RegisteredMemory::serialize() {
std::copy_n(reinterpret_cast<char*>(&pimpl_->size), sizeof(pimpl_->size), std::back_inserter(result));
std::copy_n(reinterpret_cast<char*>(&pimpl_->hostHash), sizeof(pimpl_->hostHash), std::back_inserter(result));
std::copy_n(reinterpret_cast<char*>(&pimpl_->pidHash), sizeof(pimpl_->pidHash), std::back_inserter(result));
std::copy_n(reinterpret_cast<char*>(&pimpl_->isCuMemMapAlloc), sizeof(pimpl_->isCuMemMapAlloc),
std::back_inserter(result));
std::copy_n(reinterpret_cast<char*>(&pimpl_->transports), sizeof(pimpl_->transports), std::back_inserter(result));
if (pimpl_->transportInfos.size() > static_cast<size_t>(std::numeric_limits<int8_t>::max())) {
throw mscclpp::Error("Too many transport info entries", ErrorCode::InternalError);
@@ -84,10 +154,17 @@ MSCCLPP_API_CPP std::vector<char> RegisteredMemory::serialize() {
for (auto& entry : pimpl_->transportInfos) {
std::copy_n(reinterpret_cast<char*>(&entry.transport), sizeof(entry.transport), std::back_inserter(result));
if (entry.transport == Transport::CudaIpc) {
std::copy_n(reinterpret_cast<char*>(&entry.cudaIpcBaseHandle), sizeof(entry.cudaIpcBaseHandle),
std::back_inserter(result));
std::copy_n(reinterpret_cast<char*>(&entry.cudaIpcOffsetFromBase), sizeof(entry.cudaIpcOffsetFromBase),
std::back_inserter(result));
if (pimpl_->isCuMemMapAlloc) {
std::copy_n(reinterpret_cast<char*>(&entry.shareableHandle), sizeof(entry.shareableHandle),
std::back_inserter(result));
std::copy_n(reinterpret_cast<char*>(&entry.offsetFromBase), sizeof(entry.offsetFromBase),
std::back_inserter(result));
} else {
std::copy_n(reinterpret_cast<char*>(&entry.cudaIpcBaseHandle), sizeof(entry.cudaIpcBaseHandle),
std::back_inserter(result));
std::copy_n(reinterpret_cast<char*>(&entry.cudaIpcOffsetFromBase), sizeof(entry.cudaIpcOffsetFromBase),
std::back_inserter(result));
}
} else if (AllIBTransports.has(entry.transport)) {
std::copy_n(reinterpret_cast<char*>(&entry.ibMrInfo), sizeof(entry.ibMrInfo), std::back_inserter(result));
} else {
@@ -111,6 +188,8 @@ RegisteredMemory::Impl::Impl(const std::vector<char>& serialization) {
it += sizeof(this->hostHash);
std::copy_n(it, sizeof(this->pidHash), reinterpret_cast<char*>(&this->pidHash));
it += sizeof(this->pidHash);
std::copy_n(it, sizeof(this->isCuMemMapAlloc), reinterpret_cast<char*>(&this->isCuMemMapAlloc));
it += sizeof(this->isCuMemMapAlloc);
std::copy_n(it, sizeof(this->transports), reinterpret_cast<char*>(&this->transports));
it += sizeof(this->transports);
int8_t transportCount;
@@ -121,12 +200,19 @@ RegisteredMemory::Impl::Impl(const std::vector<char>& serialization) {
std::copy_n(it, sizeof(transportInfo.transport), reinterpret_cast<char*>(&transportInfo.transport));
it += sizeof(transportInfo.transport);
if (transportInfo.transport == Transport::CudaIpc) {
std::copy_n(it, sizeof(transportInfo.cudaIpcBaseHandle),
reinterpret_cast<char*>(&transportInfo.cudaIpcBaseHandle));
it += sizeof(transportInfo.cudaIpcBaseHandle);
std::copy_n(it, sizeof(transportInfo.cudaIpcOffsetFromBase),
reinterpret_cast<char*>(&transportInfo.cudaIpcOffsetFromBase));
it += sizeof(transportInfo.cudaIpcOffsetFromBase);
if (this->isCuMemMapAlloc) {
std::copy_n(it, sizeof(transportInfo.shareableHandle), reinterpret_cast<char*>(&transportInfo.shareableHandle));
it += sizeof(transportInfo.shareableHandle);
std::copy_n(it, sizeof(transportInfo.offsetFromBase), reinterpret_cast<char*>(&transportInfo.offsetFromBase));
it += sizeof(transportInfo.offsetFromBase);
} else {
std::copy_n(it, sizeof(transportInfo.cudaIpcBaseHandle),
reinterpret_cast<char*>(&transportInfo.cudaIpcBaseHandle));
it += sizeof(transportInfo.cudaIpcBaseHandle);
std::copy_n(it, sizeof(transportInfo.cudaIpcOffsetFromBase),
reinterpret_cast<char*>(&transportInfo.cudaIpcOffsetFromBase));
it += sizeof(transportInfo.cudaIpcOffsetFromBase);
}
} else if (AllIBTransports.has(transportInfo.transport)) {
std::copy_n(it, sizeof(transportInfo.ibMrInfo), reinterpret_cast<char*>(&transportInfo.ibMrInfo));
it += sizeof(transportInfo.ibMrInfo);
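The (de)serialization above is a raw byte copy of POD fields, appended and read back in the same order, so both endpoints must be built with identical struct layouts. The pattern, factored into hypothetical helpers:

```
#include <algorithm>
#include <iterator>
#include <vector>

// Append a POD value to the byte stream.
template <typename T>
void putPod(std::vector<char>& out, const T& v) {
  std::copy_n(reinterpret_cast<const char*>(&v), sizeof(T), std::back_inserter(out));
}

// Read a POD value back and advance the cursor.
template <typename T>
const char* getPod(const char* it, T& v) {
  std::copy_n(it, sizeof(T), reinterpret_cast<char*>(&v));
  return it + sizeof(T);
}
```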
@@ -148,8 +234,18 @@ RegisteredMemory::Impl::Impl(const std::vector<char>& serialization) {
// The memory is local to the machine but not to the process, so we need to open the CUDA IPC handle
auto entry = getTransportInfo(Transport::CudaIpc);
void* base;
MSCCLPP_CUDATHROW(cudaIpcOpenMemHandle(&base, entry.cudaIpcBaseHandle, cudaIpcMemLazyEnablePeerAccess));
this->data = static_cast<char*>(base) + entry.cudaIpcOffsetFromBase;
if (this->isCuMemMapAlloc) {
CUmemGenericAllocationHandle handle;
MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(&handle, entry.shareableHandle, getNvlsCompatibleMemHandleType()));
size_t gran = getRecommendedGranularity();
MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&base, this->size, gran, 0, 0));
MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)base, this->size, 0, handle, 0));
setReadWriteMemoryAccess(base, this->size);
this->data = static_cast<char*>(base) + entry.offsetFromBase;
} else {
MSCCLPP_CUDATHROW(cudaIpcOpenMemHandle(&base, entry.cudaIpcBaseHandle, cudaIpcMemLazyEnablePeerAccess));
this->data = static_cast<char*>(base) + entry.cudaIpcOffsetFromBase;
}
INFO(MSCCLPP_P2P, "Opened CUDA IPC handle at pointer %p", this->data);
} else {
// No valid data pointer can be set
@@ -161,11 +257,22 @@ RegisteredMemory::Impl::~Impl() {
// Close the CUDA IPC handle if it was opened during deserialization
if (data && transports.has(Transport::CudaIpc) && getHostHash() == this->hostHash && getPidHash() != this->pidHash) {
void* base = static_cast<char*>(data) - getTransportInfo(Transport::CudaIpc).cudaIpcOffsetFromBase;
cudaError_t err = cudaIpcCloseMemHandle(base);
if (err != cudaSuccess) {
WARN("Failed to close CUDA IPC handle at pointer %p: %s", base, cudaGetErrorString(err));
if (this->isCuMemMapAlloc) {
CUmemGenericAllocationHandle handle;
size_t size = 0;
MSCCLPP_CULOG_WARN(cuMemRetainAllocationHandle(&handle, base));
MSCCLPP_CULOG_WARN(cuMemRelease(handle));
MSCCLPP_CULOG_WARN(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)base));
MSCCLPP_CULOG_WARN(cuMemUnmap((CUdeviceptr)base, size));
MSCCLPP_CULOG_WARN(cuMemRelease(handle));
MSCCLPP_CULOG_WARN(cuMemAddressFree((CUdeviceptr)base, size));
} else {
INFO(MSCCLPP_P2P, "Closed CUDA IPC handle at pointer %p", base);
cudaError_t err = cudaIpcCloseMemHandle(base);
if (err != cudaSuccess) {
WARN("Failed to close CUDA IPC handle at pointer %p: %s", base, cudaGetErrorString(err));
} else {
INFO(MSCCLPP_P2P, "Closed CUDA IPC handle at pointer %p", base);
}
}
data = nullptr;
}


@@ -68,12 +68,19 @@ std::string getHostName(int maxlen, const char delim) {
}
bool isNvlsSupported() {
#if (CUDART_VERSION >= 12010)
CUdevice dev;
int isNvlsSupported;
MSCCLPP_CUTHROW(cuCtxGetDevice(&dev));
MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isNvlsSupported, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev));
return isNvlsSupported == 1;
[[maybe_unused]] static bool result = false;
[[maybe_unused]] static bool isChecked = false;
#if (CUDA_NVLS_SUPPORTED)
if (!isChecked) {
int isMulticastSupported;
int isFabricSupported;
CUdevice dev;
MSCCLPP_CUTHROW(cuCtxGetDevice(&dev));
MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isMulticastSupported, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev));
MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isFabricSupported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, dev));
result = (isMulticastSupported == 1 && isFabricSupported == 1);
}
return result;
#endif
return false;
}
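Since the result is cached, isNvlsSupported() is cheap to call at every allocation site. A hedged sketch of the selection the executor test below makes at compile time, done at run time instead:

```
// Prefer physically-backed (cuMemCreate) memory when NVLS is available so the
// buffer can later be bound to a multicast object; otherwise fall back to the
// legacy allocator. Both allocators appear elsewhere in this commit.
std::shared_ptr<char> buff = mscclpp::isNvlsSupported()
                                 ? mscclpp::allocSharedPhysicalCuda<char>(bufferSize)
                                 : mscclpp::allocExtSharedCuda<char>(bufferSize);
```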

File diff suppressed because it is too large


@@ -131,7 +131,11 @@ int main(int argc, char* argv[]) {
}
mscclpp::ExecutionPlan plan(executionPlanName, executionPlanPath);
#if (CUDA_NVLS_SUPPORTED)
std::shared_ptr<char> sendbuff = mscclpp::allocSharedPhysicalCuda<char>(bufferSize);
#else
std::shared_ptr<char> sendbuff = mscclpp::allocExtSharedCuda<char>(bufferSize);
#endif
std::vector<int> dataHost(bufferSize / sizeof(int), rank);
MSCCLPP_CUDATHROW(cudaMemcpy(sendbuff.get(), dataHost.data(), bufferSize, cudaMemcpyHostToDevice));
double deltaSec = benchTime(rank, bootstrap, executor, plan, sendbuff, bufferSize, niters, ngraphIters, packetType);


@@ -1,18 +1,17 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include <stdio.h>
#if (USE_NVLS)
#include <cuda.h>
#include <cudaTypedefs.h>
#include <cuda_runtime.h>
#include <mpi.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include <mscclpp/gpu.hpp>
#if (CUDA_NVLS_SUPPORTED)
#include <cuda.h>
#include <cudaTypedefs.h>
#include <cuda_runtime.h>
#define CUCHECK(cmd) \
do { \
@@ -41,31 +40,31 @@
#define MULTIMEM_LD(val, ptr)
#endif
__global__ void init_kernel(float* uc_ptr, int size, int myrank, int nranks) {
for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < size; idx += blockDim.x * gridDim.x) {
__global__ void init_kernel(float* uc_ptr, size_t size, int myrank, int nranks) {
for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < size; idx += blockDim.x * gridDim.x) {
uc_ptr[idx] = myrank + idx;
}
}
__global__ void check_correctness(float* uc_ptr, int size, int myrank, int nranks) {
for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < size; idx += blockDim.x * gridDim.x) {
__global__ void check_correctness(float* uc_ptr, size_t size, int myrank, int nranks) {
for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < size; idx += blockDim.x * gridDim.x) {
float expected = (float)((nranks * (nranks - 1)) / 2 + nranks * idx);
if (abs(uc_ptr[idx] - expected) > 0.01 * expected) {
printf("error! idx %d: %f != %f\n", idx, uc_ptr[idx], expected);
printf("error! idx %ld: %f != %f\n", idx, uc_ptr[idx], expected);
}
}
}
__global__ void testing(float* mc_ptr, int size, int myrank, int nranks) {
__global__ void testing(float* mc_ptr, size_t size, int myrank, int nranks) {
// for allreduce we don't even need a UC pointer; just use the same mc_ptr for in-place reduction
// a line is assumed to be 16B: 4 ints or 8 halves
int my_st = ((int64_t)size * (int64_t)myrank) / (int64_t)nranks;
int my_en = ((int64_t)size * (int64_t)(myrank + 1)) / (int64_t)nranks;
size_t my_st = ((int64_t)size * (int64_t)myrank) / (int64_t)nranks;
size_t my_en = ((int64_t)size * (int64_t)(myrank + 1)) / (int64_t)nranks;
int my_offset = (threadIdx.x + blockIdx.x * blockDim.x) * 4;
int my_step = blockDim.x * gridDim.x * 4;
size_t my_offset = (threadIdx.x + blockIdx.x * blockDim.x) * 4;
size_t my_step = blockDim.x * gridDim.x * 4;
for (int idx = my_st + my_offset; idx < my_en; idx += my_step) {
for (size_t idx = my_st + my_offset; idx < my_en; idx += my_step) {
[[maybe_unused]] uint4 val;
MULTIMEM_LD(val, mc_ptr + idx);
MULTIMEM_ST(val, mc_ptr + idx);
@@ -80,7 +79,7 @@ int main() {
cudaSetDevice(myrank);
size_t size = 1024 * 1024 * 512;
size_t size = 1024ULL * 1024ULL * 512ULL * 16;
CUmemAllocationHandleType handleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
CUmulticastObjectProp mcProp = {};
@@ -138,7 +137,7 @@ int main() {
prop.requestedHandleTypes = handleType;
// allocate physical memory (data buffer)
CUCHECK(cuMemCreate(&memhandle, size, &prop, 0 /*flags*/));
CUCHECK(cuMemCreate(&memhandle, mcSize, &prop, 0 /*flags*/));
void* uc_va;
void* mc_va;
@@ -148,14 +147,14 @@ int main() {
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
// Map a VA to UC space
CUCHECK(cuMemAddressReserve((CUdeviceptr*)&uc_va, size, minGran, 0U, 0));
cudaMemset(uc_va, 0, size);
CUCHECK(cuMemMap((CUdeviceptr)uc_va, size, 0, memhandle, 0));
CUCHECK(cuMemAddressReserve((CUdeviceptr*)&uc_va, mcSize, minGran, 0U, 0));
cudaMemset(uc_va, 0, mcSize);
CUCHECK(cuMemMap((CUdeviceptr)uc_va, mcSize, 0, memhandle, 0));
// set access on UC address
CUCHECK(cuMemSetAccess((CUdeviceptr)uc_va, size, &accessDesc, 1));
CUCHECK(cuMemSetAccess((CUdeviceptr)uc_va, mcSize, &accessDesc, 1));
// everyone binds memory to the multicast
CUCHECK(cuMulticastBindMem(handle, 0 /*mcOffset*/, memhandle, 0 /*memOffset*/, size, 0));
CUCHECK(cuMulticastBindAddr(handle, 0 /*mcOffset*/, (CUdeviceptr)uc_va, mcSize, 0));
MPI_Barrier(MPI_COMM_WORLD);
// usual VA business: map both MC and PA to two different VA addresses
@@ -203,11 +202,11 @@ int main() {
return 0;
}
#else // !(USE_NVLS)
#else // !(CUDA_NVLS_SUPPORTED)
int main() {
printf("This test requires NVLS to be enabled\n");
return 0;
}
#endif // !(USE_NVLS)
#endif // !(CUDA_NVLS_SUPPORTED)