diff --git a/test/allgather_test_host_offloading.cu b/test/allgather_test_host_offloading.cu
index c7a80611..32f6b8c9 100644
--- a/test/allgather_test_host_offloading.cu
+++ b/test/allgather_test_host_offloading.cu
@@ -150,12 +150,17 @@ void setupProxyService(mscclpp::Communicator& comm, MyProxyService& proxyService
     mscclpp::Transport transport;
     if (rankToNode(r) == thisNode) {
       transport = mscclpp::Transport::CudaIpc;
     } else {
       transport = ibTransport;
     }
     // Connect with all other ranks
     proxyService.connections[r] = comm.connectOnSetup(r, 0, transport);
-    proxyService.hostEpochs.emplace_back(std::make_shared<mscclpp::HostEpoch>(comm, proxyService.connections[r]));
+    // Only IB connections need a host epoch; same-node CudaIpc peers get a null placeholder.
+    if (rankToNode(r) == thisNode) {
+      proxyService.hostEpochs.emplace_back(nullptr);
+    } else {
+      proxyService.hostEpochs.emplace_back(std::make_shared<mscclpp::HostEpoch>(comm, proxyService.connections[r]));
+    }
     proxyService.deviceEpochs.emplace_back(std::make_shared<mscclpp::DeviceEpoch>(comm, proxyService.connections[r]));
 
     comm.sendMemoryOnSetup(proxyService.localMemory, r, 0);
@@ -197,6 +202,7 @@ std::unordered_map<std::string, std::string> parseArgs(int argc, char* argv[])
 
 int main(int argc, char* argv[])
 {
+  sleep(10);
   MPI_Init(&argc, &argv);
 
   auto parsedArgs = parseArgs(argc, argv);
@@ -258,7 +264,7 @@ int main(int argc, char* argv[])
     CUDACHECK(cudaMemcpy(&deviceHandles[i], &handle, sizeof(mscclpp::DeviceEpoch::DeviceHandle), cudaMemcpyHostToDevice));
   }
 
-  kernel<<<1, world_size, 0, stream>>>(rank, world_size, fifo, deviceHandles);
+  // kernel<<<1, world_size, 0, stream>>>(rank, world_size, fifo, deviceHandles);
 
   CUDACHECK(cudaStreamSynchronize(stream));
   CUDACHECK(cudaMemcpy(data_h, data_d, dataSize, cudaMemcpyDeviceToHost));