mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-12 01:10:22 +00:00
Use smart pointer for IB structure (#585)
Change to use smart pointers for IB structures. Registered memory will own ibMr; ibCtx will not hold the reference. - Use smart pointers for IbQp and IbMr - Update memoryChannel API, keep localRegisteredMemory - Close fd when registeredMemory is released --------- Co-authored-by: Changho Hwang <changhohwang@microsoft.com>
This commit is contained in:
@@ -117,7 +117,10 @@ class MscclppAllReduce2:
|
||||
|
||||
self.scratch = GpuBuffer(self.memory.size * 8, dtype=self.memory.dtype)
|
||||
# create a memory_channel for each remote neighbor
|
||||
self.memory_channels = self.group.make_memory_channels_with_scratch(self.memory, self.scratch, self.connections)
|
||||
self.registered_scratch = self.group.register_local_memory(self.scratch, self.connections)
|
||||
self.memory_channels = self.group.make_memory_channels_with_scratch(
|
||||
self.memory, self.registered_scratch, self.connections
|
||||
)
|
||||
file_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
self.kernel = KernelBuilder(
|
||||
file="allreduce.cu", kernel_name="allreduce2", file_dir=file_dir, macro_dict={"TYPE": type_str}
|
||||
@@ -182,8 +185,9 @@ class MscclppAllReduce3:
|
||||
self.scratch = GpuBuffer(self.memory.size, dtype=self.memory.dtype)
|
||||
|
||||
# create a memory_channel for each remote neighbor
|
||||
self.registered_scratch = self.group.register_local_memory(self.scratch, self.connections)
|
||||
self.fst_round_port_chans = self.group.make_port_channels_with_scratch(
|
||||
self.proxy_service, self.memory, self.scratch, self.connections
|
||||
self.proxy_service, self.memory, self.registered_scratch, self.connections
|
||||
)
|
||||
self.snd_round_port_chans = self.group.make_port_channels(self.proxy_service, self.memory, self.connections)
|
||||
file_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
@@ -263,8 +267,9 @@ class MscclppAllReduce4:
|
||||
same_node_connections = {rank: conn for rank, conn in self.connections.items() if in_same_node(rank)}
|
||||
# create a memory_channel for each remote neighbor
|
||||
self.memory_channels = self.group.make_memory_channels(self.memory, same_node_connections)
|
||||
self.registered_scratch = self.group.register_local_memory(self.scratch, self.connections)
|
||||
self.reduce_scatter_port_channels = self.group.make_port_channels_with_scratch(
|
||||
self.proxy_service, self.memory, self.scratch, self.connections
|
||||
self.proxy_service, self.memory, self.registered_scratch, self.connections
|
||||
)
|
||||
self.all_gather_port_channels = self.group.make_port_channels(self.proxy_service, self.memory, self.connections)
|
||||
file_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
@@ -365,11 +370,12 @@ class MscclppAllReduce5:
|
||||
same_node_connections = {rank: conn for rank, conn in self.connections.items() if in_same_node(rank)}
|
||||
across_node_connections = {rank: conn for rank, conn in self.connections.items() if not in_same_node(rank)}
|
||||
# create a memory_channel for each remote neighbor
|
||||
self.registered_scratch = self.group.register_local_memory(self.scratch, self.connections)
|
||||
self.memory_channels = self.group.make_memory_channels_with_scratch(
|
||||
self.memory, self.scratch, same_node_connections
|
||||
self.memory, self.registered_scratch, same_node_connections
|
||||
)
|
||||
self.port_channels = self.group.make_port_channels_with_scratch(
|
||||
self.proxy_service, self.put_buff, self.scratch, across_node_connections
|
||||
self.proxy_service, self.put_buff, self.registered_scratch, across_node_connections
|
||||
)
|
||||
file_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
self.kernel = KernelBuilder(
|
||||
|
||||
Reference in New Issue
Block a user