Remove cuda-python from project (#245)

Remove cuda-python and use CuPy APIs instead --------- Co-authored-by: Changho Hwang <changhohwang@microsoft.com>
2026-04-19 14:29:13 +00:00 · 2024-02-13 05:44:11 -08:00
parent d97fef4395
commit 5971508eed
7 changed files with 71 additions and 76 deletions
--- a/python/mscclpp/utils.py
+++ b/python/mscclpp/utils.py
@@ -8,7 +8,6 @@ import subprocess
 import tempfile
 from typing import Any, Type

-from cuda import cuda, nvrtc, cudart
 import cupy as cp
 import numpy as np

@@ -22,35 +21,14 @@ except ImportError:
    torchTensor = Type[Any]


-def _check_cuda_errors(result):
-    if result[0].value:
-        raise RuntimeError(f"CUDA error code={result[0].value}({_cuda_get_error(result[0])})")
-    if len(result) == 1:
-        return None
-    elif len(result) == 2:
-        return result[1]
-    else:
-        return result[1:]
-
-
-def _cuda_get_error(error):
-    if isinstance(error, cuda.CUresult):
-        err, name = cuda.cuGetErrorName(error)
-        return name if err == cuda.CUresult.CUDA_SUCCESS else "<unknown>"
-    elif isinstance(error, cudart.cudaError_t):
-        return cudart.cudaGetErrorName(error)[1]
-    elif isinstance(error, nvrtc.nvrtcResult):
-        return nvrtc.nvrtcGetErrorString(error)[1]
-    else:
-        raise RuntimeError("Unknown error type: {}".format(error))
-
-
 class Kernel:
-    def __init__(self, ptx: bytes, kernel_name: str, device_id: int):
-        self._context = _check_cuda_errors(cuda.cuCtxGetCurrent())
-        assert self._context is not None
-        self._module = _check_cuda_errors(cuda.cuModuleLoadData(ptx))
-        self._kernel = _check_cuda_errors(cuda.cuModuleGetFunction(self._module, kernel_name.encode()))
+    CU_LAUNCH_PARAM_BUFFER_POINTER = 0x01
+    CU_LAUNCH_PARAM_BUFFER_SIZE = 0x02
+    CU_LAUNCH_PARAM_END = 0x00 if not cp.cuda.runtime.is_hip else 0x03
+
+    def __init__(self, ptx: bytes, kernel_name: str):
+        self._module = cp.cuda.driver.moduleLoadData(ptx)
+        self._kernel = cp.cuda.driver.moduleGetFunction(self._module, kernel_name)

    def launch_kernel(
        self,
@@ -58,26 +36,27 @@ class Kernel:
        nblocks: int,
        nthreads: int,
        shared: int,
-        stream: Type[cuda.CUstream] or Type[cudart.cudaStream_t],
+        stream: Type[cp.cuda.Stream] or Type[None],
    ):
        buffer = (ctypes.c_byte * len(params)).from_buffer_copy(params)
        buffer_size = ctypes.c_size_t(len(params))
        config = np.array(
            [
-                cuda.CU_LAUNCH_PARAM_BUFFER_POINTER,
+                Kernel.CU_LAUNCH_PARAM_BUFFER_POINTER,
                ctypes.addressof(buffer),
-                cuda.CU_LAUNCH_PARAM_BUFFER_SIZE,
+                Kernel.CU_LAUNCH_PARAM_BUFFER_SIZE,
                ctypes.addressof(buffer_size),
-                cuda.CU_LAUNCH_PARAM_END,
+                Kernel.CU_LAUNCH_PARAM_END,
            ],
            dtype=np.uint64,
        )
-        _check_cuda_errors(
-            cuda.cuLaunchKernel(self._kernel, nblocks, 1, 1, nthreads, 1, 1, shared, stream, 0, config.ctypes.data)
+        cuda_stream = stream.ptr if stream else 0
+        cp.cuda.driver.launchKernel(
+            self._kernel, nblocks, 1, 1, nthreads, 1, 1, shared, cuda_stream, 0, config.ctypes.data
        )

    def __del__(self):
-        cuda.cuModuleUnload(self._module)
+        cp.cuda.driver.moduleUnload(self._module)


 class KernelBuilder:
@@ -96,35 +75,48 @@ class KernelBuilder:
        self.macros = None
        if file_dir:
            self.macros = ["-D{}={}".format(macro, value) for macro, value in macro_dict.items()]
-        device_id = cp.cuda.Device().id
-        ptx = self._compile_cuda(os.path.join(self._current_file_dir, file), f"{kernel_name}.ptx", device_id)
-        self._kernel = Kernel(ptx, kernel_name, device_id)
+        ptx = self._compile_cuda(os.path.join(self._current_file_dir, file), f"{kernel_name}.ptx")
+        self._kernel = Kernel(ptx, kernel_name)
        self.kernel_map[kernel_key] = self._kernel

-    def _compile_cuda(self, source_file, output_file, device_id, std_version="c++17"):
+    def _compile_cuda(self, source_file, output_file, std_version="c++17"):
        mscclpp_home = os.environ.get("MSCCLPP_HOME", "/usr/local/mscclpp")
        include_dir = os.path.join(mscclpp_home, "include")
-        major = _check_cuda_errors(
-            cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device_id)
-        )
-        minor = _check_cuda_errors(
-            cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, device_id)
-        )
-        cuda_home = os.environ.get("CUDA_HOME")
-        nvcc = os.path.join(cuda_home, "bin/nvcc") if cuda_home else "nvcc"
-        command = [
-            nvcc,
-            f"-std={std_version}",
-            "-ptx",
-            "-Xcompiler",
-            "-Wall,-Wextra",
-            f"-I{include_dir}",
-            f"{source_file}",
-            f"--gpu-architecture=compute_{major}{minor}",
-            f"--gpu-code=sm_{major}{minor},compute_{major}{minor}",
-            "-o",
-            f"{self._tempdir.name}/{output_file}",
-        ]
+        if not cp.cuda.runtime.is_hip:
+            compute_capability = cp.cuda.Device().compute_capability
+            cuda_home = os.environ.get("CUDA_HOME")
+            nvcc = os.path.join(cuda_home, "bin/nvcc") if cuda_home else "nvcc"
+            command = [
+                nvcc,
+                f"-std={std_version}",
+                "-ptx",
+                "-Xcompiler",
+                "-Wall,-Wextra",
+                f"-I{include_dir}",
+                f"{source_file}",
+                f"--gpu-architecture=compute_{compute_capability}",
+                f"--gpu-code=sm_{compute_capability},compute_{compute_capability}",
+                "-o",
+                f"{self._tempdir.name}/{output_file}",
+            ]
+        else:
+            # the gcn arch name is like "gfx942:sramecc+:xnack-"
+            gcn_arch = (
+                cp.cuda.runtime.getDeviceProperties(cp.cuda.Device().id)["gcnArchName"].decode("utf-8").split(":")[0]
+            )
+            rocm_home = os.environ.get("ROCM_HOME")
+            hipcc = os.path.join(rocm_home, "bin/hipcc") if rocm_home else "hipcc"
+            command = [
+                hipcc,
+                f"-std={std_version}",
+                "--genco",
+                "-D__HIP_PLATFORM_AMD__",
+                f"--offload-arch={gcn_arch}",
+                f"-I{include_dir}",
+                f"{source_file}",
+                "-o",
+                f"{self._tempdir.name}/{output_file}",
+            ]
        if self.macros:
            command += self.macros
        try:
--- a/python/mscclpp_benchmark/allreduce.cu
+++ b/python/mscclpp_benchmark/allreduce.cu
@@ -1,7 +1,11 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.

+#if defined(__HIP_PLATFORM_AMD__)
+#include <hip/hip_fp16.h>
+#else
 #include <cuda_fp16.h>
+#endif

 #include <mscclpp/concurrency_device.hpp>
 #include <mscclpp/nvls_device.hpp>
--- a/python/mscclpp_benchmark/allreduce_bench.py
+++ b/python/mscclpp_benchmark/allreduce_bench.py
@@ -84,7 +84,7 @@ def check_correctness(memory, func, niter=100):
    for p in range(niter):
        memory[:] = cp.ones(memory.shape).astype(data_type) * (p * MPI.COMM_WORLD.size + MPI.COMM_WORLD.rank)
        cp.cuda.runtime.deviceSynchronize()
-        output_memory = func(0)
+        output_memory = func(None)
        cp.cuda.runtime.deviceSynchronize()
        expected = cp.zeros_like(memory)
        for i in range(MPI.COMM_WORLD.size):
@@ -110,7 +110,7 @@ def bench_time(niter: int, func):
    with stream:
        stream.begin_capture()
        for i in range(niter):
-            func(stream.ptr)
+            func(stream)
        graph = stream.end_capture()

    # now run a warm up round
--- a/python/mscclpp_benchmark/mscclpp_op.py
+++ b/python/mscclpp_benchmark/mscclpp_op.py
@@ -66,8 +66,8 @@ class MscclppAllReduce1:

        self.set_params(nblocks, block_size, read_only)

-    def __call__(self, stream_ptr):
-        self.kernel.launch_kernel(self.params, self.nblocks, self.block_size, 0, stream_ptr)
+    def __call__(self, stream):
+        self.kernel.launch_kernel(self.params, self.nblocks, self.block_size, 0, stream)
        return self.memory

    def set_params(self, nblocks, block_size, read_only):
@@ -131,8 +131,8 @@ class MscclppAllReduce2:

        self.set_params(nblocks, block_size)

-    def __call__(self, stream_ptr):
-        self.kernel.launch_kernel(self.params, self.nblocks, self.block_size, 0, stream_ptr)
+    def __call__(self, stream):
+        self.kernel.launch_kernel(self.params, self.nblocks, self.block_size, 0, stream)
        return self.memory_out

    def set_params(self, nblocks, block_size):
@@ -201,8 +201,8 @@ class MscclppAllReduce3:

        self.set_params(nblocks, block_size)

-    def __call__(self, stream_ptr):
-        self.kernel.launch_kernel(self.params, 24, 1024, 0, stream_ptr)
+    def __call__(self, stream):
+        self.kernel.launch_kernel(self.params, 24, 1024, 0, stream)
        return self.memory

    def set_params(self, nblocks, block_size):
@@ -295,8 +295,8 @@ class MscclppAllReduce4:

        self.set_params(nblocks, block_size, pipeline_depth)

-    def __call__(self, stream_ptr):
-        self.kernel.launch_kernel(self.params, self.nblocks, self.block_size, 0, stream_ptr)
+    def __call__(self, stream):
+        self.kernel.launch_kernel(self.params, self.nblocks, self.block_size, 0, stream)
        return self.memory

    def set_params(self, nblocks, block_size, pipeline_depth):
@@ -388,8 +388,8 @@ class MscclppAllReduce5:

        self.set_params(nblocks, block_size)

-    def __call__(self, stream_ptr):
-        self.kernel.launch_kernel(self.params, self.nblocks, self.block_size, 0, stream_ptr)
+    def __call__(self, stream):
+        self.kernel.launch_kernel(self.params, self.nblocks, self.block_size, 0, stream)
        return self.memory_out

    def set_params(self, nblocks, block_size):
--- a/python/mscclpp_benchmark/nccl_op.py
+++ b/python/mscclpp_benchmark/nccl_op.py
@@ -16,7 +16,8 @@ class NcclAllReduce:
        else:
            raise RuntimeError("Make sure that the data type is mapped to the correct NCCL data type")

-    def __call__(self, stream_ptr):
+    def __call__(self, stream):
+        stream_ptr = stream.ptr if stream else 0
        self.nccl_comm.allReduce(
            self.memory.data.ptr, self.memory.data.ptr, self.memory.size, self.nccl_dtype, nccl.NCCL_SUM, stream_ptr
        )
--- a/python/requirements_cu11.txt
+++ b/python/requirements_cu11.txt
@@ -1,7 +1,6 @@
 mpi4py
 cupy-cuda11x
 prettytable
-cuda-python
 netifaces
 pytest
 numpy
--- a/python/requirements_cu12.txt
+++ b/python/requirements_cu12.txt
@@ -1,7 +1,6 @@
 mpi4py
 cupy-cuda12x
 prettytable
-cuda-python
 netifaces
 pytest
 numpy