CUDA graphs now work with the P2P proxy

2026-05-24 14:54:51 +00:00 · 2023-02-28 23:25:51 +00:00
parent 3d051a985f
commit ac1bf6dc52
2 changed files with 35 additions and 29 deletions
--- a/src/proxy.cc
+++ b/src/proxy.cc
@@ -53,6 +53,8 @@ void* mscclppProxyServiceP2P(void* _args) {
  PROXYCUDACHECK(cudaSetDevice(comm->cudaDev));
  PROXYCUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  // TODO(chhwang): find numa node for this gpu
+  cudaStreamCaptureStatus stat;
+  cudaStreamIsCapturing(stream, &stat);

  while (*run) {
    // Try send
--- a/tests/p2p_test.cu
+++ b/tests/p2p_test.cu
@@ -7,6 +7,7 @@
 #include <unistd.h>
 #include <string>

+#define RANKS_PER_NODE 8
 #define USE_DMA_FOR_P2P 1
 #define TEST_CONN_TYPE 0 // 0: P2P(for local)+IB(for remote), 1: IB-Only

@@ -242,39 +243,42 @@ int main(int argc, const char *argv[])

  // warm up
  int warmupiter = 1000;
-  for (int i = 0; i < warmupiter; ++i) {
-    kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size);
+//  for (int i = 0; i < warmupiter; ++i) {
+//    kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size);
+//  }
+
+  // cudaGraph Capture
+  cudaGraph_t graph;
+  cudaGraphExec_t instance;
+  cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
+  int cudagraphiter = 100;
+  for (int i = 0; i < cudagraphiter; ++i) {
+  	kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size);
  }
+  cudaStreamEndCapture(stream, &graph);
+  cudaGraphInstantiate(&instance, graph, NULL, NULL, 0);

-  // // cudaGraph Capture
-  // cudaGraph_t graph;
-  // cudaGraphExec_t instance;
-  // cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
-  // int cudagraphiter = 100;
-  // for (int i = 0; i < cudagraphiter; ++i) {
-  //   kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size);
-  // }
-  // cudaStreamEndCapture(stream, &graph);
-  // cudaGraphInstantiate(&instance, graph, NULL, NULL, 0);
-
-  // int cudagraphwarmup = 200;
-  // for (int i = 0; i < cudagraphwarmup; ++i) {
-  //   cudaGraphLaunch(instance, stream);
-  // }
-
-  // // measure runtime 
-  // CUDACHECK(cudaEventRecord(ev_start, stream));
-  // int cudagraphlaunch = 1000;
-  // for (int i = 0; i < cudagraphlaunch; ++i) {
-  //   // kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size);
-  //   cudaGraphLaunch(instance, stream);
-  // }
-  // CUDACHECK(cudaEventRecord(ev_end, stream));
+  int cudagraphwarmup = 200;
+  for (int i = 0; i < cudagraphwarmup; ++i) {
+	  cudaGraphLaunch(instance, stream);
+  }
  CUDACHECK(cudaStreamSynchronize(stream));

-  // float ms;
-  // CUDACHECK(cudaEventElapsedTime(&ms, ev_start, ev_end));
-  // printf("rank: %d, time: %f us/iter\n", rank, ms * 1000. / (float) cudagraphlaunch / (float) cudagraphiter);
+  // measure runtime 
+//  CUDACHECK(cudaEventRecord(ev_start, stream));
+  double t0 = MPI_Wtime();
+  int cudagraphlaunch = 1000;
+  for (int i = 0; i < cudagraphlaunch; ++i) {
+  // kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size);
+     cudaGraphLaunch(instance, stream);
+  }
+//  CUDACHECK(cudaEventRecord(ev_end, stream));
+  CUDACHECK(cudaStreamSynchronize(stream));
+
+  double t1 = MPI_Wtime();
+  float ms = (t1-t0)*1000.0;
+//  CUDACHECK(cudaEventElapsedTime(&ms, ev_start, ev_end));
+  printf("rank: %d, time: %f us/iter\n", rank, ms * 1000. / (float) cudagraphlaunch / (float) cudagraphiter);

  MSCCLPPCHECK(mscclppProxyStop(comm));