mirror of
https://github.com/kvcache-ai/sglang.git
synced 2026-06-30 19:57:52 +00:00
62 lines
1.7 KiB
Python
Executable File
62 lines
1.7 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Simple RCCL test for multi-GPU communication.
|
|
This test verifies that RCCL can initialize and communicate across multiple GPUs.
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
|
|
import torch
|
|
import torch.distributed as dist
|
|
|
|
|
|
def test_rccl_allreduce():
    """Run one SUM all-reduce across every rank and exit with the verdict.

    Meant to be started once per GPU by a distributed launcher, which supplies
    the rendezvous environment that ``dist.init_process_group`` reads.

    Each rank fills a 1000-element tensor with its own rank number, the ranks
    sum their tensors with ``dist.all_reduce``, and every rank compares the
    element total with the closed-form value ``sum(range(world_size)) * 1000``.

    This function never returns normally; it always ends the process through
    ``sys.exit``:

    * ``0`` -- the reduced tensor matched the expected sum.
    * ``1`` -- CUDA is unavailable, or the reduced sum did not match.
    """
    if not torch.cuda.is_available():
        print("CUDA not available, skipping test")
        sys.exit(1)

    # Initialize process group with NCCL (RCCL on AMD)
    dist.init_process_group(backend="nccl")

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    print(f"[Rank {rank}/{world_size}] Initialized successfully")

    # Device indices are per node, while ``rank`` is global. Indexing GPUs by
    # the global rank breaks on multi-node runs (e.g. rank 8 on a second
    # 8-GPU node has no "cuda:8"). Prefer the launcher-provided LOCAL_RANK and
    # fall back to wrapping the global rank over the visible devices, which
    # equals the old behaviour whenever rank < device_count.
    local_rank_env = os.environ.get("LOCAL_RANK")
    if local_rank_env is not None:
        local_rank = int(local_rank_env)
    else:
        local_rank = rank % torch.cuda.device_count()

    device = torch.device(f"cuda:{local_rank}")
    torch.cuda.set_device(device)

    print(f"[Rank {rank}] Device: {torch.cuda.get_device_name(device)}")
    total_memory_gb = torch.cuda.get_device_properties(device).total_memory / 1e9
    print(f"[Rank {rank}] Device memory: {total_memory_gb:.2f} GB")

    # Create a tensor and perform allreduce. Every element holds this rank's
    # number, so after a SUM reduction each element is 0 + 1 + ... + (N - 1).
    tensor = torch.ones(1000, device=device) * rank
    print(f"[Rank {rank}] Before allreduce: tensor sum = {tensor.sum().item()}")

    dist.all_reduce(tensor, op=dist.ReduceOp.SUM)

    expected_sum = sum(range(world_size)) * 1000
    actual_sum = tensor.sum().item()

    print(
        f"[Rank {rank}] After allreduce: tensor sum = {actual_sum}, expected = {expected_sum}"
    )

    # The result comes back as a float, so compare with a small tolerance.
    passed = abs(actual_sum - expected_sum) < 0.1
    if passed:
        print(f"[Rank {rank}] ✓ RCCL allreduce test PASSED")
    else:
        print(f"[Rank {rank}] ✗ RCCL allreduce test FAILED")

    # Single teardown path: release the process group, then report the
    # verdict through the exit status.
    dist.destroy_process_group()
    sys.exit(0 if passed else 1)
|
|
|
|
|
|
# Script entry point: run the all-reduce check only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    test_rccl_allreduce()
|