From 7a4c27778f0a267b37f07a342a6b3e7aa0871717 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Fri, 24 Mar 2023 08:29:00 +0000 Subject: [PATCH 1/3] 30 sec timeout for socket accept --- src/bootstrap/socket.cc | 8 ++++++++ src/include/utils.h | 6 ++++++ 2 files changed, 14 insertions(+) diff --git a/src/bootstrap/socket.cc b/src/bootstrap/socket.cc index 55a22386..cacaa141 100644 --- a/src/bootstrap/socket.cc +++ b/src/bootstrap/socket.cc @@ -12,6 +12,8 @@ #include #include +#define MSCCLPP_SOCKET_ACCEPT_TIMEOUT 30 + static mscclppResult_t socketProgressOpt(int op, struct mscclppSocket* sock, void* ptr, int size, int* offset, int block, int* closed) { int bytes = 0; *closed = 0; @@ -406,13 +408,19 @@ mscclppResult_t mscclppSocketGetAddr(struct mscclppSocket* sock, union mscclppSo } static mscclppResult_t socketTryAccept(struct mscclppSocket* sock) { + static double initTime = -1; + if (initTime == -1) initTime = clockSec(); socklen_t socklen = sizeof(union mscclppSocketAddress); sock->fd = accept(sock->acceptFd, &sock->addr.sa, &socklen); if (sock->fd != -1) { sock->state = mscclppSocketStateAccepted; + initTime = -1; } else if (errno != EAGAIN && errno != EWOULDBLOCK) { WARN("socketTryAccept: get errno %d that is not EAGAIN or EWOULDBLOCK", errno); return mscclppSystemError; + } else if (clockSec() - initTime > MSCCLPP_SOCKET_ACCEPT_TIMEOUT) { + WARN("socketTryAccept: exceeded timeout (%d sec)", MSCCLPP_SOCKET_ACCEPT_TIMEOUT); + return mscclppRemoteError; } return mscclppSuccess; } diff --git a/src/include/utils.h b/src/include/utils.h index 1cde0721..64fbf0bc 100644 --- a/src/include/utils.h +++ b/src/include/utils.h @@ -49,6 +49,12 @@ inline uint64_t clockNano() { return uint64_t(ts.tv_sec)*1000*1000*1000 + ts.tv_nsec; } +inline double clockSec() { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return double(ts.tv_sec) + (double)ts.tv_nsec * 1e-9; +} + /* get any bytes of random data from /dev/urandom, return 0 if it succeeds; else * return -1 
*/ inline mscclppResult_t getRandomData(void* buffer, size_t bytes) { From 35b8ebaf64e4ffef6f9ba22eba8c8c086637bfef Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Fri, 24 Mar 2023 19:42:00 +0000 Subject: [PATCH 2/3] retry for almost 20 seconds --- src/bootstrap/socket.cc | 13 ++++++------- src/include/socket.h | 2 ++ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/bootstrap/socket.cc b/src/bootstrap/socket.cc index cacaa141..71055fb7 100644 --- a/src/bootstrap/socket.cc +++ b/src/bootstrap/socket.cc @@ -12,8 +12,6 @@ #include #include -#define MSCCLPP_SOCKET_ACCEPT_TIMEOUT 30 - static mscclppResult_t socketProgressOpt(int op, struct mscclppSocket* sock, void* ptr, int size, int* offset, int block, int* closed) { int bytes = 0; *closed = 0; @@ -408,19 +406,19 @@ mscclppResult_t mscclppSocketGetAddr(struct mscclppSocket* sock, union mscclppSo } static mscclppResult_t socketTryAccept(struct mscclppSocket* sock) { - static double initTime = -1; - if (initTime == -1) initTime = clockSec(); socklen_t socklen = sizeof(union mscclppSocketAddress); sock->fd = accept(sock->acceptFd, &sock->addr.sa, &socklen); if (sock->fd != -1) { sock->state = mscclppSocketStateAccepted; - initTime = -1; } else if (errno != EAGAIN && errno != EWOULDBLOCK) { WARN("socketTryAccept: get errno %d that is not EAGAIN or EWOULDBLOCK", errno); return mscclppSystemError; - } else if (clockSec() - initTime > MSCCLPP_SOCKET_ACCEPT_TIMEOUT) { - WARN("socketTryAccept: exceeded timeout (%d sec)", MSCCLPP_SOCKET_ACCEPT_TIMEOUT); + } else if (++sock->acceptRetries == RETRY_ACCEPT_TIMES) { + WARN("socketTryAccept: exceeded retries (%d)", sock->acceptRetries); return mscclppRemoteError; + } else { + usleep(SLEEP_INT); + if (sock->acceptRetries % 1000 == 0) INFO(MSCCLPP_ALL, "socketTryAccept: Call to try accept returned %s, retrying", strerror(errno)); } return mscclppSuccess; } @@ -699,6 +697,7 @@ mscclppResult_t mscclppSocketInit(struct mscclppSocket* sock, union mscclppSocke if (sock 
== NULL) goto exit; sock->timedOutRetries = 0; sock->refusedRetries = 0; + sock->acceptRetries = 0; sock->abortFlag = abortFlag; sock->asyncFlag = asyncFlag; sock->state = mscclppSocketStateInitialized; diff --git a/src/include/socket.h b/src/include/socket.h index 53c93036..75eb1b41 100644 --- a/src/include/socket.h +++ b/src/include/socket.h @@ -20,6 +20,7 @@ #define SLEEP_INT 1000 // connection retry sleep interval in usec #define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec) #define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s) +#define RETRY_ACCEPT_TIMES 2e4 // connection accept retry times before reporting a timeout (about 20 sec, one SLEEP_INT per retry) #define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV) #define MSCCLPP_SOCKET_MAGIC 0x564ab9f2fc4b9d6cULL @@ -57,6 +58,7 @@ struct mscclppSocket { int acceptFd; int timedOutRetries; int refusedRetries; + int acceptRetries; union mscclppSocketAddress addr; volatile uint32_t* abortFlag; int asyncFlag; From b07508b8f35924ce477425f34b198064c52b90d1 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Fri, 24 Mar 2023 19:43:41 +0000 Subject: [PATCH 3/3] removed clockSec since it is not used --- src/include/utils.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/include/utils.h b/src/include/utils.h index 64fbf0bc..1cde0721 100644 --- a/src/include/utils.h +++ b/src/include/utils.h @@ -49,12 +49,6 @@ inline uint64_t clockNano() { return uint64_t(ts.tv_sec)*1000*1000*1000 + ts.tv_nsec; } -inline double clockSec() { - struct timespec ts; - clock_gettime(CLOCK_MONOTONIC, &ts); - return double(ts.tv_sec) + (double)ts.tv_nsec * 1e-9; -} - /* get any bytes of random data from /dev/urandom, return 0 if it succeeds; else * return -1 */ inline mscclppResult_t getRandomData(void* buffer, size_t bytes) {