From 6ad82e8bbea263f272df92f0fa304cadada5a65e Mon Sep 17 00:00:00 2001
From: Qinghua Zhou
Date: Wed, 29 Apr 2026 20:44:37 +0000
Subject: [PATCH] tests/ep: disable NCCL HeartbeatMonitor to silence mpirun shutdown noise

Set TORCH_NCCL_ENABLE_MONITORING=0 before importing torch.distributed.
The barrier+destroy_process_group finally block (afbdcd6a) suffices under
torchrun, but under mpirun rank 0 (the TCPStore server) can exit before
non-zero ranks finish teardown, and the background heartbeat thread polls
the store and logs 'recvValue failed / Connection was likely closed'.
Disabling the monitor outright is safe for short-lived bench runs.
---
 test/python/ext/ep/test_low_latency_multirank.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/test/python/ext/ep/test_low_latency_multirank.py b/test/python/ext/ep/test_low_latency_multirank.py
index c48eca90..f6206136 100644
--- a/test/python/ext/ep/test_low_latency_multirank.py
+++ b/test/python/ext/ep/test_low_latency_multirank.py
@@ -35,6 +35,12 @@
 import os
 import random
 import sys
 
+# Disable ProcessGroupNCCL's HeartbeatMonitor before importing torch.distributed.
+# It runs in a background thread polling the TCPStore; under mpirun, rank 0
+# (the store server) can exit before non-zero ranks finish teardown, producing
+# noisy 'recvValue failed / Connection was likely closed' stack traces.
+os.environ.setdefault("TORCH_NCCL_ENABLE_MONITORING", "0")
+
 import torch
 import torch.distributed as dist