mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-12 09:17:06 +00:00
163 lines
5.1 KiB
Plaintext
163 lines
5.1 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Copyright (c) Microsoft Corporation.\n",
|
|
"Licensed under the MIT license.\n",
|
|
"\n",
|
|
"The following example demonstrates how to initialize the MSCCL++ library and perform necessary setup for communicating from GPU kernels. First we define a function for registering memory, making connections and creating channels."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import mscclpp\n",
"\n",
"def setup_channels(comm, memory, proxy_service):\n",
"    \"\"\"Register `memory` with `comm`, connect to every peer rank, and\n",
"    return one mscclpp.SimpleProxyChannel per peer connection.\"\"\"\n",
"    # Register our GPU buffer so peers can reach it over CUDA IPC\n",
"    local_mem = comm.register_memory(memory.data.ptr, memory.nbytes, mscclpp.Transport.CudaIpc)\n",
"\n",
"    # Every rank except our own is a peer\n",
"    my_rank = comm.bootstrap.rank\n",
"    peers = [r for r in range(comm.bootstrap.size) if r != my_rank]\n",
"\n",
"    # Kick off a connection to each peer and swap registered memories with it\n",
"    conn_futures = []\n",
"    remote_futures = []\n",
"    for peer in peers:\n",
"        conn_futures.append(comm.connect(peer, 0, mscclpp.Transport.CudaIpc))\n",
"        comm.send_memory(local_mem, peer, 0)\n",
"        remote_futures.append(comm.recv_memory(peer, 0))\n",
"\n",
"    # connect()/recv_memory() hand back futures; unwrap them once all\n",
"    # requests are in flight\n",
"    conns = [fut.get() for fut in conn_futures]\n",
"    remote_mems = [fut.get() for fut in remote_futures]\n",
"\n",
"    # Build a proxy channel (semaphore + remote memory + local memory)\n",
"    # for each established connection\n",
"    channels = []\n",
"    for conn, remote_mem in zip(conns, remote_mems):\n",
"        semaphore_id = proxy_service.build_and_add_semaphore(comm, conn)\n",
"        channels.append(mscclpp.SimpleProxyChannel(\n",
"            proxy_service.proxy_channel(semaphore_id),\n",
"            proxy_service.add_memory(remote_mem),\n",
"            proxy_service.add_memory(local_mem),\n",
"        ))\n",
"    return channels"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Now we are ready to write the top-level code for each rank."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import cupy as cp\n",
"\n",
"def run(rank, world_size, if_ip_port_trio):\n",
"    \"\"\"Per-process entry point: bring up MSCCL++ on this rank's GPU.\n",
"\n",
"    Args:\n",
"        rank: this process's rank; also used as its CUDA device index.\n",
"        world_size: total number of participating processes.\n",
"        if_ip_port_trio: 'interface:ip:port' string naming the root rank's\n",
"            TCP bootstrap endpoint (e.g. 'eth0:localhost:50051').\n",
"    \"\"\"\n",
"    # Use the right GPU for this rank\n",
"    cp.cuda.Device(rank).use()\n",
"\n",
"    # Allocate memory on the GPU\n",
"    memory = cp.zeros(1024, dtype=cp.int32)\n",
"\n",
"    # Initialize a bootstrapper using a known interface/IP/port trio for the root rank\n",
"    boot = mscclpp.TcpBootstrap.create(rank, world_size)\n",
"    boot.initialize(if_ip_port_trio)\n",
"\n",
"    # Create a communicator for the processes in the bootstrapper\n",
"    comm = mscclpp.Communicator(boot)\n",
"\n",
"    # Create a proxy service, which enables GPU kernels to use connections\n",
"    proxy_service = mscclpp.ProxyService()\n",
"\n",
"    if rank == 0:\n",
"        print(\"Setting up channels\")\n",
"    proxy_channels = setup_channels(comm, memory, proxy_service)\n",
"\n",
"    if rank == 0:\n",
"        print(\"Starting proxy service\")\n",
"    proxy_service.start_proxy()\n",
"\n",
"    # This is where we could launch a GPU kernel that uses proxy_channels[i].device_handle\n",
"    # to initiate communication. See include/mscclpp/proxy_channel_device.hpp for details.\n",
"    if rank == 0:\n",
"        print(\"GPU kernels that use the proxy go here.\")\n",
"\n",
"    if rank == 0:\n",
"        # fixed: was an f-string with no placeholders (redundant f-prefix,\n",
"        # inconsistent with the other prints); output text is unchanged\n",
"        print(\"Stopping proxy service\")\n",
"    proxy_service.stop_proxy()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Finally, to test the code we can run each process using the `multiprocessing` package."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Setting up channels\n",
|
|
"Starting proxy service\n",
|
|
"GPU kernels that use the proxy go here.\n",
|
|
"Stopping proxy service\n",
|
|
"\n",
|
|
"Starting proxy service\n",
|
|
"GPU kernels that use the proxy go here.\n",
|
|
"Stopping proxy service\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import multiprocessing as mp\n",
"\n",
"# Launch one process per rank; the bootstrap endpoint below must match the\n",
"# interface/IP/port trio that every rank passes to run().\n",
"world_size = 2\n",
"bootstrap_trio = \"eth0:localhost:50051\"\n",
"\n",
"workers = []\n",
"for rank in range(world_size):\n",
"    worker = mp.Process(target=run, args=(rank, world_size, bootstrap_trio))\n",
"    worker.start()\n",
"    workers.append(worker)\n",
"\n",
"# Wait for every rank to finish\n",
"for worker in workers:\n",
"    worker.join()"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": ".venv",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.10"
|
|
},
|
|
"orig_nbformat": 4
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|