diff --git a/src/bootstrap/socket.cc b/src/bootstrap/socket.cc index 55a22386..71055fb7 100644 --- a/src/bootstrap/socket.cc +++ b/src/bootstrap/socket.cc @@ -413,6 +413,12 @@ static mscclppResult_t socketTryAccept(struct mscclppSocket* sock) { } else if (errno != EAGAIN && errno != EWOULDBLOCK) { WARN("socketTryAccept: get errno %d that is not EAGAIN or EWOULDBLOCK", errno); return mscclppSystemError; + } else if (++sock->acceptRetries == RETRY_ACCEPT_TIMES) { + WARN("socketTryAccept: exceeded retries (%d)", sock->acceptRetries); + return mscclppRemoteError; + } else { + usleep(SLEEP_INT); + if (sock->acceptRetries % 1000 == 0) INFO(MSCCLPP_ALL, "socketTryAccept: Call to try accept returned %s, retrying", strerror(errno)); } return mscclppSuccess; } @@ -691,6 +697,7 @@ mscclppResult_t mscclppSocketInit(struct mscclppSocket* sock, union mscclppSocke if (sock == NULL) goto exit; sock->timedOutRetries = 0; sock->refusedRetries = 0; + sock->acceptRetries = 0; sock->abortFlag = abortFlag; sock->asyncFlag = asyncFlag; sock->state = mscclppSocketStateInitialized; diff --git a/src/include/socket.h b/src/include/socket.h index c2d56365..5e0de72b 100644 --- a/src/include/socket.h +++ b/src/include/socket.h @@ -21,6 +21,7 @@ #define SLEEP_INT 1000 // connection retry sleep interval in usec #define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec) #define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s) +#define RETRY_ACCEPT_TIMES 2e4 // accept retry times before reporting an error (~20 sec total: one SLEEP_INT sleep per retry) #define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV) #define MSCCLPP_SOCKET_MAGIC 0x564ab9f2fc4b9d6cULL @@ -58,6 +59,7 @@ struct mscclppSocket { int acceptFd; int timedOutRetries; int refusedRetries; + int acceptRetries; union mscclppSocketAddress addr; volatile uint32_t* abortFlag; int asyncFlag;