diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 828ea4c2..ad641c85 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -1479,9 +1479,10 @@ static void ggml_cuda_op_mul_mat_cublas( GGML_UNUSED(src1_padded_row_size); } -static void ggml_cuda_set_peer_access(int main_device) { +static bool ggml_cuda_set_peer_access(int main_device) { ggml_cuda_set_device(main_device); + bool all_enabled = true; for (int id_other = 0; id_other < ggml_backend_cuda_get_device_count(); ++id_other) { if (main_device == id_other) { continue; @@ -1500,8 +1501,11 @@ static void ggml_cuda_set_peer_access(int main_device) { // reset the error (void)cudaGetLastError(); } + } else { + all_enabled = false; } } + return all_enabled; } static cudaError_t ggml_cuda_Memcpy2DPeerAsync( @@ -4453,7 +4457,7 @@ GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device, [[maybe_unused]] con #if !defined(GGML_CUDA_NO_PEER_COPY) if (enable_p2p) { - ggml_cuda_set_peer_access(device); + ctx->p2p_enabled = ggml_cuda_set_peer_access(device); } #endif diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 27909321..c1034d8d 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -843,6 +843,7 @@ struct ggml_backend_cuda_context { int device; std::string name; cudaEvent_t copy_event = nullptr; + bool p2p_enabled = false; cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } }; cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; diff --git a/ggml/src/ggml-cuda/reduce.cu b/ggml/src/ggml-cuda/reduce.cu index cf8d59e3..7b63bd47 100644 --- a/ggml/src/ggml-cuda/reduce.cu +++ b/ggml/src/ggml-cuda/reduce.cu @@ -176,7 +176,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_ GGML_ASSERT(ii == nhave); GGML_ASSERT(have_this_device); } - if (nhave == 4 && dst->ne[1] <= 8) { + if (nhave == 4 && dst->ne[1] <= 8 && ctx.p2p_enabled) { for (int ii = 0; ii < nhave; ++ii) { int i = idx[ii]; GGML_ASSERT(dst->src[i]->type == dst->type); @@ -241,7 +241,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_ ggml_cuda_set_device(ctx.device); return; } - if (dst->ne[1] <= 8) { + if (dst->ne[1] <= 8 && ctx.p2p_enabled) { for (int ii = 0; ii < nhave; ++ii) { int i = idx[ii]; GGML_ASSERT(dst->src[i]->type == dst->type);