diff --git a/src/bootstrap/socket.cc b/src/bootstrap/socket.cc
index cacaa141..71055fb7 100644
--- a/src/bootstrap/socket.cc
+++ b/src/bootstrap/socket.cc
@@ -12,8 +12,6 @@
 #include
 #include
 
-#define MSCCLPP_SOCKET_ACCEPT_TIMEOUT 30
-
 static mscclppResult_t socketProgressOpt(int op, struct mscclppSocket* sock, void* ptr, int size, int* offset, int block, int* closed) {
   int bytes = 0;
   *closed = 0;
@@ -408,19 +406,25 @@ mscclppResult_t mscclppSocketGetAddr(struct mscclppSocket* sock, union mscclppSo
 }
 
+// Makes one accept() attempt on sock->acceptFd. On success, stores the new fd in sock->fd and sets
+// the state to mscclppSocketStateAccepted. On EAGAIN/EWOULDBLOCK, sleeps SLEEP_INT usec and returns
+// mscclppSuccess with the state unchanged, until the per-socket counter reaches RETRY_ACCEPT_TIMES,
+// after which it returns mscclppRemoteError. Any other errno returns mscclppSystemError.
 static mscclppResult_t socketTryAccept(struct mscclppSocket* sock) {
-  static double initTime = -1;
-  if (initTime == -1) initTime = clockSec();
   socklen_t socklen = sizeof(union mscclppSocketAddress);
   sock->fd = accept(sock->acceptFd, &sock->addr.sa, &socklen);
   if (sock->fd != -1) {
     sock->state = mscclppSocketStateAccepted;
-    initTime = -1;
   } else if (errno != EAGAIN && errno != EWOULDBLOCK) {
     WARN("socketTryAccept: get errno %d that is not EAGAIN or EWOULDBLOCK", errno);
     return mscclppSystemError;
-  } else if (clockSec() - initTime > MSCCLPP_SOCKET_ACCEPT_TIMEOUT) {
-    WARN("socketTryAccept: exceeded timeout (%d sec)", MSCCLPP_SOCKET_ACCEPT_TIMEOUT);
+  } else if (++sock->acceptRetries >= RETRY_ACCEPT_TIMES) {
+    // >= rather than ==: a caller that keeps polling after the limit must keep getting the error.
+    WARN("socketTryAccept: exceeded retries (%d)", sock->acceptRetries);
     return mscclppRemoteError;
+  } else {
+    // Log before sleeping: usleep() is allowed to modify errno, which would corrupt the message.
+    if (sock->acceptRetries % 1000 == 0) INFO(MSCCLPP_ALL, "socketTryAccept: Call to try accept returned %s, retrying", strerror(errno));
+    usleep(SLEEP_INT);
   }
   return mscclppSuccess;
 }
@@ -699,6 +703,7 @@ mscclppResult_t mscclppSocketInit(struct mscclppSocket* sock, union mscclppSocke
   if (sock == NULL) goto exit;
   sock->timedOutRetries = 0;
   sock->refusedRetries = 0;
+  sock->acceptRetries = 0;
   sock->abortFlag = abortFlag;
   sock->asyncFlag = asyncFlag;
   sock->state = mscclppSocketStateInitialized;
diff --git a/src/include/socket.h b/src/include/socket.h
index 53c93036..75eb1b41 100644
--- a/src/include/socket.h
+++ b/src/include/socket.h
@@ -20,6 +20,7 @@
 #define SLEEP_INT 1000 // connection retry sleep interval in usec
 #define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec)
 #define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s)
+#define RETRY_ACCEPT_TIMES 2e4 // accept retry times before reporting a timeout (SLEEP_INT usec sleep per retry, ~20 sec total)
 #define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV)
 #define MSCCLPP_SOCKET_MAGIC 0x564ab9f2fc4b9d6cULL
 
@@ -57,6 +58,7 @@ struct mscclppSocket {
   int acceptFd;
   int timedOutRetries;
   int refusedRetries;
+  int acceptRetries;
   union mscclppSocketAddress addr;
   volatile uint32_t* abortFlag;
   int asyncFlag;