mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-12 01:10:22 +00:00
add debugging code
This commit is contained in:
@@ -166,9 +166,11 @@ def build_bufs(
|
||||
else:
|
||||
input_buf = GpuBuffer(nelems_input, dtype=dtype)
|
||||
|
||||
in_place = False
|
||||
|
||||
test_buf = cp.zeros(nelems, dtype=dtype)
|
||||
|
||||
return input_buf, result_buf, test_buf
|
||||
return input_buf, result_buf, test_buf, nelems
|
||||
|
||||
|
||||
def main(
|
||||
@@ -190,7 +192,7 @@ def main(
|
||||
collective = execution_plan.collective
|
||||
|
||||
dtype = parse_dtype(dtype_str)
|
||||
input_buf, result_buf, test_buf = build_bufs(
|
||||
input_buf, result_buf, test_buf, nelem = build_bufs(
|
||||
collective,
|
||||
size,
|
||||
in_place,
|
||||
@@ -212,6 +214,22 @@ def main(
|
||||
)
|
||||
|
||||
mscclpp_group.barrier()
|
||||
print("size= ", size, "nelem= ", nelem)
|
||||
|
||||
# Sentinel fill: choose something unlikely in your pattern
|
||||
result_buf.fill(cp.float16(123.0))
|
||||
cp.cuda.runtime.deviceSynchronize()
|
||||
|
||||
# Run ONE execution (no graph), then sync
|
||||
stream = cp.cuda.Stream(non_blocking=True)
|
||||
with stream:
|
||||
executor_func(stream)
|
||||
stream.synchronize()
|
||||
|
||||
# Count how many elements changed
|
||||
changed = cp.count_nonzero(result_buf != cp.float16(123.0)).item()
|
||||
print("changed elements:", changed, "out of", result_buf.size)
|
||||
|
||||
bench_correctness(
|
||||
collective,
|
||||
input_buf,
|
||||
|
||||
@@ -120,4 +120,195 @@ TEST_DATA_REDUCE_SCATTER(int32, int)
|
||||
|
||||
TEST_DATA_ALL_TO_ALL(float16, __half)
|
||||
TEST_DATA_ALL_TO_ALL(float32, float)
|
||||
TEST_DATA_ALL_TO_ALL(int32, int)
|
||||
TEST_DATA_ALL_TO_ALL(int32, int)
|
||||
|
||||
/*#define TEST_DATA_SENDRECV(FuncNameType, DataType) \
|
||||
extern "C" __global__ void __launch_bounds__(1024, 1) test_data_sendrecv_##FuncNameType( \
|
||||
DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank, int seq) { \
|
||||
\
|
||||
/* Ring semantics: receive from prev rank */ \
|
||||
/* int peer_rank = (my_rank - 1 + num_ranks) % num_ranks; \
|
||||
\
|
||||
unsigned int seed = \
|
||||
(unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + peer_rank + seq); \
|
||||
\
|
||||
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; \
|
||||
i < num_elems; \
|
||||
i += blockDim.x * gridDim.x) { \
|
||||
seed = ranqd1(seed); \
|
||||
test_buf[i] = DataType(seed % blockDim.x) / DataType(blockDim.x); \
|
||||
\
|
||||
/* Optional: print first few mismatches */ \
|
||||
/* if (result_buf[i] != test_buf[i] && blockIdx.x == 0 && threadIdx.x == 0 && i < 8) { \
|
||||
printf("MISMATCH rank=%d peer=%d i=%zu result=%f expected=%f\n", \
|
||||
my_rank, peer_rank, i, (float)result_buf[i], (float)test_buf[i]); \
|
||||
} \
|
||||
\
|
||||
assert(result_buf[i] == test_buf[i]); \
|
||||
} \
|
||||
}*/
|
||||
|
||||
|
||||
/*#define TEST_DATA_SENDRECV(FuncNameType, DataType) \
|
||||
extern "C" __global__ void __launch_bounds__(1024, 1) test_data_sendrecv_##FuncNameType( \
|
||||
DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank, int seq) { \
|
||||
\
|
||||
int prev_rank = (my_rank - 1 + num_ranks) % num_ranks; \
|
||||
int next_rank = (my_rank + 1) % num_ranks; \
|
||||
int self_rank = my_rank; \
|
||||
\
|
||||
unsigned int seed_prev = \
|
||||
(unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + prev_rank + seq); \
|
||||
unsigned int seed_next = \
|
||||
(unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + next_rank + seq); \
|
||||
unsigned int seed_self = \
|
||||
(unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + self_rank + seq); \
|
||||
\
|
||||
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; \
|
||||
i < num_elems; \
|
||||
i += blockDim.x * gridDim.x) { \
|
||||
\
|
||||
seed_prev = ranqd1(seed_prev); \
|
||||
seed_next = ranqd1(seed_next); \
|
||||
seed_self = ranqd1(seed_self); \
|
||||
\
|
||||
DataType exp_prev = DataType(seed_prev % blockDim.x) / DataType(blockDim.x); \
|
||||
DataType exp_next = DataType(seed_next % blockDim.x) / DataType(blockDim.x); \
|
||||
DataType exp_self = DataType(seed_self % blockDim.x) / DataType(blockDim.x); \
|
||||
\
|
||||
/* For compatibility: avoid %zu formatting quirks on device */ \
|
||||
/* unsigned long long ii = (unsigned long long)i; \
|
||||
\
|
||||
if (result_buf[i] != exp_prev) { \
|
||||
/* Print only a few mismatches to avoid flooding */ \
|
||||
/* if (blockIdx.x == 0 && (threadIdx.x == 0 || threadIdx.x == 192) && ii < 256ULL) { \
|
||||
printf("sendrecv-mismatch rank=%d nranks=%d i=%llu result=%f exp_prev(from %d)=%f " \
|
||||
"exp_next(from %d)=%f exp_self(from %d)=%f\n", \
|
||||
my_rank, num_ranks, ii, \
|
||||
(float)result_buf[i], \
|
||||
prev_rank, (float)exp_prev, \
|
||||
next_rank, (float)exp_next, \
|
||||
self_rank, (float)exp_self); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
test_buf[i] = exp_prev; \
|
||||
assert(result_buf[i] == test_buf[i]); \
|
||||
} \
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
#define TEST_DATA_SENDRECV(FuncNameType, DataType) \
|
||||
extern "C" __global__ void __launch_bounds__(1024, 1) test_data_sendrecv_##FuncNameType( \
|
||||
DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank, int seq) { \
|
||||
\
|
||||
/* Expected ring semantics (if your algorithm is ring-prev). */ \
|
||||
int prev_rank = (my_rank - 1 + num_ranks) % num_ranks; \
|
||||
int next_rank = (my_rank + 1) % num_ranks; \
|
||||
int self_rank = my_rank; \
|
||||
\
|
||||
/* Thread identity and stride must match fill_data_* generation pattern. */ \
|
||||
const unsigned long long tid = \
|
||||
(unsigned long long)(blockIdx.x * blockDim.x + threadIdx.x); \
|
||||
const unsigned long long stride = \
|
||||
(unsigned long long)(blockDim.x * gridDim.x); \
|
||||
\
|
||||
for (unsigned long long i = tid; i < (unsigned long long)num_elems; i += stride) { \
|
||||
\
|
||||
/* Compute how many iterations this thread advanced before reaching i. */ \
|
||||
unsigned long long k = (i - tid) / stride; \
|
||||
\
|
||||
/* Helper lambda: compute expected value for a given sender rank r at element i for this thread. */ \
|
||||
auto expected_for_rank = [&](int r) -> DataType { \
|
||||
unsigned int s = (unsigned int)(tid + (unsigned long long)r + (unsigned long long)seq); \
|
||||
/* fill_data does: seed=ranqd1(seed) once per element visited. \
|
||||
For the k-th visited element, apply ranqd1 (k+1) times. */ \
|
||||
for (unsigned long long step = 0; step < k + 1; ++step) { \
|
||||
s = ranqd1(s); \
|
||||
} \
|
||||
return DataType(s % blockDim.x) / DataType(blockDim.x); \
|
||||
}; \
|
||||
\
|
||||
DataType exp_prev = expected_for_rank(prev_rank); \
|
||||
DataType exp_next = expected_for_rank(next_rank); \
|
||||
DataType exp_self = expected_for_rank(self_rank); \
|
||||
\
|
||||
/* Store expected(prev) in test_buf for the assert (keeps compatibility with your current check). */ \
|
||||
test_buf[i] = exp_prev; \
|
||||
\
|
||||
if (result_buf[i] != test_buf[i]) { \
|
||||
/* Try to identify which rank's stream matches the observed result. */ \
|
||||
int matched = -1; \
|
||||
for (int r = 0; r < num_ranks; ++r) { \
|
||||
DataType exp_r = expected_for_rank(r); \
|
||||
if (result_buf[i] == exp_r) { \
|
||||
matched = r; \
|
||||
break; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* Print only a small number of mismatches to avoid log spam. */ \
|
||||
if (blockIdx.x == 0 && (threadIdx.x == 0 || threadIdx.x == 160) && i < 256ULL) { \
|
||||
printf("sendrecv-mismatch rank=%d nranks=%d i=%llu result=%f " \
|
||||
"exp_prev(from %d)=%f exp_next(from %d)=%f exp_self(from %d)=%f matched_sender=%d\n", \
|
||||
my_rank, num_ranks, i, \
|
||||
(float)result_buf[i], \
|
||||
prev_rank, (float)exp_prev, \
|
||||
next_rank, (float)exp_next, \
|
||||
self_rank, (float)exp_self, \
|
||||
matched); \
|
||||
} \
|
||||
\
|
||||
assert(result_buf[i] == test_buf[i]); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
#define TEST_DATA_SENDRECV(FuncNameType, DataType) \
|
||||
extern "C" __global__ void __launch_bounds__(1024, 1) \
|
||||
test_data_sendrecv_##FuncNameType( \
|
||||
DataType* result_buf, \
|
||||
DataType* test_buf, \
|
||||
size_t num_elems, \
|
||||
int num_ranks, \
|
||||
int my_rank, \
|
||||
int seq) { \
|
||||
\
|
||||
int prev_rank = (my_rank - 1 + num_ranks) % num_ranks; \
|
||||
int next_rank = (my_rank + 1) % num_ranks; \
|
||||
\
|
||||
unsigned int seed_prev = \
|
||||
(unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + prev_rank + seq); \
|
||||
unsigned int seed_next = \
|
||||
(unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + next_rank + seq); \
|
||||
\
|
||||
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; \
|
||||
i < num_elems; \
|
||||
i += blockDim.x * gridDim.x) { \
|
||||
\
|
||||
seed_prev = ranqd1(seed_prev); \
|
||||
seed_next = ranqd1(seed_next); \
|
||||
\
|
||||
DataType exp_prev = DataType(seed_prev % blockDim.x) / DataType(blockDim.x); \
|
||||
DataType exp_next = DataType(seed_next % blockDim.x) / DataType(blockDim.x); \
|
||||
\
|
||||
if (result_buf[i] != exp_prev) { \
|
||||
if (blockIdx.x == 0 && threadIdx.x == 0 && i < 8) { \
|
||||
printf("***rank=%d i=%zu result=%f prev(from %d)=%f next(from %d)=%f\n", \
|
||||
my_rank, i, (float)result_buf[i], \
|
||||
prev_rank, (float)exp_prev, \
|
||||
next_rank, (float)exp_next); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
test_buf[i] = exp_prev; \
|
||||
assert(result_buf[i] == test_buf[i]); \
|
||||
} \
|
||||
}
|
||||
*/
|
||||
TEST_DATA_SENDRECV(float16, __half)
|
||||
TEST_DATA_SENDRECV(float32, float)
|
||||
TEST_DATA_SENDRECV(int32, int)
|
||||
|
||||
Reference in New Issue
Block a user