diff --git a/test/python/ext/ep/test_low_latency_multirank.py b/test/python/ext/ep/test_low_latency_multirank.py
index c48eca90..f6206136 100644
--- a/test/python/ext/ep/test_low_latency_multirank.py
+++ b/test/python/ext/ep/test_low_latency_multirank.py
@@ -35,6 +35,12 @@
 import os
 import random
 import sys
 
+# Disable ProcessGroupNCCL's HeartbeatMonitor before importing torch.distributed.
+# It runs in a background thread polling the TCPStore; under mpirun, rank 0
+# (the store server) can exit before non-zero ranks finish teardown, producing
+# noisy 'recvValue failed / Connection was likely closed' stack traces.
+os.environ.setdefault("TORCH_NCCL_ENABLE_MONITORING", "0")
+
 import torch
 import torch.distributed as dist