Works without MPI_Bcast

This commit is contained in:
Saeed Maleki
2023-02-06 23:04:03 +00:00
parent d8e0547639
commit 38c3bf56eb
3 changed files with 17 additions and 8 deletions

View File

@@ -185,10 +185,13 @@ mscclppResult_t bootstrapCreateRoot(struct mscclppBootstrapHandle* handle, bool
return mscclppSuccess;
}
mscclppResult_t bootstrapGetUniqueId(struct mscclppBootstrapHandle* handle) {
memset(handle, 0, sizeof(mscclppBootstrapHandle));
MSCCLPPCHECK(getRandomData(&handle->magic, sizeof(handle->magic)));
// #include <netinet/in.h>
// #include <arpa/inet.h>
mscclppResult_t bootstrapGetUniqueId(struct mscclppBootstrapHandle* handle, bool isRoot) {
memset(handle, 0, sizeof(mscclppBootstrapHandle));
// MSCCLPPCHECK(getRandomData(&handle->magic, sizeof(handle->magic)));
handle->magic = 0xdeadbeef;
char* env = getenv("MSCCLPP_COMM_ID");
if (env) {
INFO(MSCCLPP_ENV, "MSCCLPP_COMM_ID set by environment to %s", env);
@@ -196,10 +199,14 @@ mscclppResult_t bootstrapGetUniqueId(struct mscclppBootstrapHandle* handle) {
WARN("Invalid MSCCLPP_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
return mscclppInvalidArgument;
}
if (isRoot)
MSCCLPPCHECK(bootstrapCreateRoot(handle, false));
} else {
memcpy(&handle->addr, &bootstrapNetIfAddr, sizeof(union mscclppSocketAddress));
MSCCLPPCHECK(bootstrapCreateRoot(handle, false));
}
// printf("addr = %s port = %d\n", inet_ntoa(handle->addr.sin.sin_addr), (int)ntohs(handle->addr.sin.sin_port));
// printf("addr = %s\n", inet_ntoa((*(struct sockaddr_in*)&handle->addr.sa).sin_addr));
return mscclppSuccess;
}

View File

@@ -11,6 +11,8 @@ int main()
int world_size;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
// int a;
// scanf("%d", &a);
mscclppResult_t res = bootstrapNetInit();
if (res != mscclppSuccess) {
@@ -19,15 +21,15 @@ int main()
}
mscclppBootstrapHandle handle;
if (rank == 0) {
res = bootstrapGetUniqueId(&handle);
if (true || rank == 0) {
res = bootstrapGetUniqueId(&handle, rank == 0);
if (res != mscclppSuccess) {
printf("bootstrapGetUniqueId failed\n");
return -1;
}
}
MPI_Bcast(&handle, sizeof(mscclppBootstrapHandle), MPI_BYTE, 0, MPI_COMM_WORLD);
// MPI_Bcast(&handle, sizeof(mscclppBootstrapHandle), MPI_BYTE, 0, MPI_COMM_WORLD);
mscclppComm *comm;
res = mscclppCalloc(&comm, 1);
@@ -80,6 +82,6 @@ int main()
MPI_Finalize();
printf("Succeeded!\n");
printf("Succeeded! %d\n", rank);
return 0;
}

View File

@@ -20,7 +20,7 @@ static_assert(sizeof(struct mscclppBootstrapHandle) <= sizeof(mscclppUniqueId),
mscclppResult_t bootstrapNetInit();
mscclppResult_t bootstrapCreateRoot(struct mscclppBootstrapHandle* handle, bool idFromEnv);
mscclppResult_t bootstrapGetUniqueId(struct mscclppBootstrapHandle* handle);
mscclppResult_t bootstrapGetUniqueId(struct mscclppBootstrapHandle* handle, bool isRoot = true);
mscclppResult_t bootstrapInit(struct mscclppBootstrapHandle* handle, struct mscclppComm* comm);
mscclppResult_t bootstrapAllGather(void* commState, void* allData, int size);
mscclppResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);