From a4bb8fb4bf0b94310071fab6b48d747174eab733 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 3 Apr 2026 21:30:21 +0000 Subject: [PATCH] add debugging code --- python/test/executor_test.py | 22 ++- python/test/executor_test_verifier.cu | 193 +++++++++++++++++++++++++- 2 files changed, 212 insertions(+), 3 deletions(-) diff --git a/python/test/executor_test.py b/python/test/executor_test.py index 59bc1661..83b2cb86 100644 --- a/python/test/executor_test.py +++ b/python/test/executor_test.py @@ -166,9 +166,11 @@ def build_bufs( else: input_buf = GpuBuffer(nelems_input, dtype=dtype) + in_place = False + test_buf = cp.zeros(nelems, dtype=dtype) - return input_buf, result_buf, test_buf + return input_buf, result_buf, test_buf, nelems def main( @@ -190,7 +192,7 @@ def main( collective = execution_plan.collective dtype = parse_dtype(dtype_str) - input_buf, result_buf, test_buf = build_bufs( + input_buf, result_buf, test_buf, nelem = build_bufs( collective, size, in_place, @@ -212,6 +214,22 @@ def main( ) mscclpp_group.barrier() + print("size= ", size, "nelem= ", nelem) + + # Sentinel fill: choose something unlikely in your pattern + result_buf.fill(cp.float16(123.0)) + cp.cuda.runtime.deviceSynchronize() + + # Run ONE execution (no graph), then sync + stream = cp.cuda.Stream(non_blocking=True) + with stream: + executor_func(stream) + stream.synchronize() + + # Count how many elements changed + changed = cp.count_nonzero(result_buf != cp.float16(123.0)).item() + print("changed elements:", changed, "out of", result_buf.size) + bench_correctness( collective, input_buf, diff --git a/python/test/executor_test_verifier.cu b/python/test/executor_test_verifier.cu index cf3cd4a6..5c96a922 100644 --- a/python/test/executor_test_verifier.cu +++ b/python/test/executor_test_verifier.cu @@ -120,4 +120,195 @@ TEST_DATA_REDUCE_SCATTER(int32, int) TEST_DATA_ALL_TO_ALL(float16, __half) TEST_DATA_ALL_TO_ALL(float32, float) -TEST_DATA_ALL_TO_ALL(int32, int) \ No newline at end of file 
TEST_DATA_ALL_TO_ALL(int32, int)

/*
 * Verifier kernel for the send/recv (ring) collective.
 *
 * Each rank is expected to hold the data stream generated by the PREVIOUS
 * rank in the ring (receive-from-prev semantics).  The expected value for
 * element i is re-derived with the same quick-and-dirty ranqd1 recurrence
 * that the fill_data_* kernels use: seed0 = tid + sender_rank + seq, advanced
 * once per element visited by this thread.
 * NOTE(review): this assumes fill_data_* seeds exactly as
 * (tid + rank + seq) and advances once per visited element — confirm against
 * the fill kernels if asserts fire on correct transfers.
 *
 * On a mismatch the kernel brute-forces every candidate sender rank to
 * report whose stream the observed value actually matches (helps tell
 * "wrong peer" apart from "corrupted data"), prints a bounded number of
 * diagnostics, then asserts.
 */
#define TEST_DATA_SENDRECV(FuncNameType, DataType)                                                    \
  extern "C" __global__ void __launch_bounds__(1024, 1) test_data_sendrecv_##FuncNameType(            \
      DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank,         \
      int seq) {                                                                                      \
    /* Ring peers: we expect prev's data; next/self streams are computed only for diagnostics. */     \
    int prev_rank = (my_rank - 1 + num_ranks) % num_ranks;                                            \
    int next_rank = (my_rank + 1) % num_ranks;                                                        \
    /* 64-bit thread id / stride: avoids 32-bit wrap for very large grids. */                         \
    const unsigned long long tid = (unsigned long long)blockIdx.x * blockDim.x + threadIdx.x;         \
    const unsigned long long stride = (unsigned long long)blockDim.x * gridDim.x;                     \
    /* Incremental seeds advanced once per visited element — identical sequence to re-deriving */     \
    /* from seed0 with k+1 ranqd1 steps, but O(1) per element instead of O(k).                 */     \
    unsigned int seed_prev = (unsigned int)(tid + (unsigned long long)prev_rank + (unsigned long long)seq); \
    unsigned int seed_next = (unsigned int)(tid + (unsigned long long)next_rank + (unsigned long long)seq); \
    unsigned int seed_self = (unsigned int)(tid + (unsigned long long)my_rank + (unsigned long long)seq);   \
    for (unsigned long long i = tid; i < (unsigned long long)num_elems; i += stride) {                \
      seed_prev = ranqd1(seed_prev);                                                                  \
      seed_next = ranqd1(seed_next);                                                                  \
      seed_self = ranqd1(seed_self);                                                                  \
      DataType exp_prev = DataType(seed_prev % blockDim.x) / DataType(blockDim.x);                    \
      DataType exp_next = DataType(seed_next % blockDim.x) / DataType(blockDim.x);                    \
      DataType exp_self = DataType(seed_self % blockDim.x) / DataType(blockDim.x);                    \
      /* Keep the expected (prev) value in test_buf so the assert below matches */                    \
      /* the convention of the other TEST_DATA_* verifiers.                     */                    \
      test_buf[i] = exp_prev;                                                                         \
      if (result_buf[i] != test_buf[i]) {                                                             \
        /* Diagnostics only (rare path): brute-force which sender's stream matches. */                \
        unsigned long long k = (i - tid) / stride; /* iterations taken to reach i */                  \
        int matched = -1;                                                                             \
        for (int r = 0; r < num_ranks; ++r) {                                                         \
          unsigned int s = (unsigned int)(tid + (unsigned long long)r + (unsigned long long)seq);     \
          for (unsigned long long step = 0; step <= k; ++step) {                                      \
            s = ranqd1(s);                                                                            \
          }                                                                                           \
          if (result_buf[i] == DataType(s % blockDim.x) / DataType(blockDim.x)) {                     \
            matched = r;                                                                              \
            break;                                                                                    \
          }                                                                                           \
        }                                                                                             \
        /* Bound the log volume: two representative threads of block 0, low indices only. */          \
        if (blockIdx.x == 0 && (threadIdx.x == 0 || threadIdx.x == 160) && i < 256ULL) {              \
          printf("sendrecv-mismatch rank=%d nranks=%d i=%llu result=%f "                              \
                 "exp_prev(from %d)=%f exp_next(from %d)=%f exp_self(from %d)=%f matched_sender=%d\n",\
                 my_rank, num_ranks, i,                                                               \
                 (float)result_buf[i],                                                                \
                 prev_rank, (float)exp_prev,                                                          \
                 next_rank, (float)exp_next,                                                          \
                 my_rank, (float)exp_self,                                                            \
                 matched);                                                                            \
        }                                                                                             \
        assert(result_buf[i] == test_buf[i]);                                                         \
      }                                                                                               \
    }                                                                                                 \
  }

TEST_DATA_SENDRECV(float16, __half)
TEST_DATA_SENDRECV(float32, float)
TEST_DATA_SENDRECV(int32, int)