From a4bb8fb4bf0b94310071fab6b48d747174eab733 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 3 Apr 2026 21:30:21 +0000 Subject: [PATCH] add debugging code --- python/test/executor_test.py | 22 ++- python/test/executor_test_verifier.cu | 193 +++++++++++++++++++++++++- 2 files changed, 212 insertions(+), 3 deletions(-) diff --git a/python/test/executor_test.py b/python/test/executor_test.py index 59bc1661..83b2cb86 100644 --- a/python/test/executor_test.py +++ b/python/test/executor_test.py @@ -166,9 +166,11 @@ def build_bufs( else: input_buf = GpuBuffer(nelems_input, dtype=dtype) + in_place = False + test_buf = cp.zeros(nelems, dtype=dtype) - return input_buf, result_buf, test_buf + return input_buf, result_buf, test_buf, nelems def main( @@ -190,7 +192,7 @@ def main( collective = execution_plan.collective dtype = parse_dtype(dtype_str) - input_buf, result_buf, test_buf = build_bufs( + input_buf, result_buf, test_buf, nelem = build_bufs( collective, size, in_place, @@ -212,6 +214,22 @@ def main( ) mscclpp_group.barrier() + print("size= ", size, "nelem= ", nelem) + + # Sentinel fill: choose something unlikely in your pattern + result_buf.fill(cp.float16(123.0)) + cp.cuda.runtime.deviceSynchronize() + + # Run ONE execution (no graph), then sync + stream = cp.cuda.Stream(non_blocking=True) + with stream: + executor_func(stream) + stream.synchronize() + + # Count how many elements changed + changed = cp.count_nonzero(result_buf != cp.float16(123.0)).item() + print("changed elements:", changed, "out of", result_buf.size) + bench_correctness( collective, input_buf, diff --git a/python/test/executor_test_verifier.cu b/python/test/executor_test_verifier.cu index cf3cd4a6..5c96a922 100644 --- a/python/test/executor_test_verifier.cu +++ b/python/test/executor_test_verifier.cu @@ -120,4 +120,195 @@ TEST_DATA_REDUCE_SCATTER(int32, int) TEST_DATA_ALL_TO_ALL(float16, __half) TEST_DATA_ALL_TO_ALL(float32, float) -TEST_DATA_ALL_TO_ALL(int32, int) \ No newline at end of file 
TEST_DATA_ALL_TO_ALL(int32, int)

/*
 * Verifier kernel for the send/recv (ring) collective.
 *
 * Each rank is expected to hold the data stream generated by the PREVIOUS
 * rank in the ring (receive-from-prev semantics).  The expected value for
 * element i is re-derived with the same quick-and-dirty ranqd1 recurrence
 * that the fill_data_* kernels use: seed0 = tid + sender_rank + seq, advanced
 * once per element visited by this thread.
 * NOTE(review): this assumes fill_data_* seeds exactly as
 * (tid + rank + seq) and advances once per visited element — confirm against
 * the fill kernels if asserts fire on correct transfers.
 *
 * On a mismatch the kernel brute-forces every candidate sender rank to
 * report whose stream the observed value actually matches (helps tell
 * "wrong peer" apart from "corrupted data"), prints a bounded number of
 * diagnostics, then asserts.
 */
#define TEST_DATA_SENDRECV(FuncNameType, DataType)                                                    \
  extern "C" __global__ void __launch_bounds__(1024, 1) test_data_sendrecv_##FuncNameType(            \
      DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank,         \
      int seq) {                                                                                      \
    /* Ring peers: we expect prev's data; next/self streams are computed only for diagnostics. */     \
    int prev_rank = (my_rank - 1 + num_ranks) % num_ranks;                                            \
    int next_rank = (my_rank + 1) % num_ranks;                                                        \
    /* 64-bit thread id / stride: avoids 32-bit wrap for very large grids. */                         \
    const unsigned long long tid = (unsigned long long)blockIdx.x * blockDim.x + threadIdx.x;         \
    const unsigned long long stride = (unsigned long long)blockDim.x * gridDim.x;                     \
    /* Incremental seeds advanced once per visited element — identical sequence to re-deriving */     \
    /* from seed0 with k+1 ranqd1 steps, but O(1) per element instead of O(k).                 */     \
    unsigned int seed_prev = (unsigned int)(tid + (unsigned long long)prev_rank + (unsigned long long)seq); \
    unsigned int seed_next = (unsigned int)(tid + (unsigned long long)next_rank + (unsigned long long)seq); \
    unsigned int seed_self = (unsigned int)(tid + (unsigned long long)my_rank + (unsigned long long)seq);   \
    for (unsigned long long i = tid; i < (unsigned long long)num_elems; i += stride) {                \
      seed_prev = ranqd1(seed_prev);                                                                  \
      seed_next = ranqd1(seed_next);                                                                  \
      seed_self = ranqd1(seed_self);                                                                  \
      DataType exp_prev = DataType(seed_prev % blockDim.x) / DataType(blockDim.x);                    \
      DataType exp_next = DataType(seed_next % blockDim.x) / DataType(blockDim.x);                    \
      DataType exp_self = DataType(seed_self % blockDim.x) / DataType(blockDim.x);                    \
      /* Keep the expected (prev) value in test_buf so the assert below matches */                    \
      /* the convention of the other TEST_DATA_* verifiers.                     */                    \
      test_buf[i] = exp_prev;                                                                         \
      if (result_buf[i] != test_buf[i]) {                                                             \
        /* Diagnostics only (rare path): brute-force which sender's stream matches. */                \
        unsigned long long k = (i - tid) / stride; /* iterations taken to reach i */                  \
        int matched = -1;                                                                             \
        for (int r = 0; r < num_ranks; ++r) {                                                         \
          unsigned int s = (unsigned int)(tid + (unsigned long long)r + (unsigned long long)seq);     \
          for (unsigned long long step = 0; step <= k; ++step) {                                      \
            s = ranqd1(s);                                                                            \
          }                                                                                           \
          if (result_buf[i] == DataType(s % blockDim.x) / DataType(blockDim.x)) {                     \
            matched = r;                                                                              \
            break;                                                                                    \
          }                                                                                           \
        }                                                                                             \
        /* Bound the log volume: two representative threads of block 0, low indices only. */          \
        if (blockIdx.x == 0 && (threadIdx.x == 0 || threadIdx.x == 160) && i < 256ULL) {              \
          printf("sendrecv-mismatch rank=%d nranks=%d i=%llu result=%f "                              \
                 "exp_prev(from %d)=%f exp_next(from %d)=%f exp_self(from %d)=%f matched_sender=%d\n",\
                 my_rank, num_ranks, i,                                                               \
                 (float)result_buf[i],                                                                \
                 prev_rank, (float)exp_prev,                                                          \
                 next_rank, (float)exp_next,                                                          \
                 my_rank, (float)exp_self,                                                            \
                 matched);                                                                            \
        }                                                                                             \
        assert(result_buf[i] == test_buf[i]);                                                         \
      }                                                                                               \
    }                                                                                                 \
  }

TEST_DATA_SENDRECV(float16, __half)
TEST_DATA_SENDRECV(float32, float)
TEST_DATA_SENDRECV(int32, int)