Remove cuda-python from project (#245)

Remove cuda-python and use CuPy APIs instead

---------

Co-authored-by: Changho Hwang <changhohwang@microsoft.com>
This commit is contained in:
Binyang Li
2024-02-13 05:44:11 -08:00
committed by GitHub
parent d97fef4395
commit 5971508eed
7 changed files with 71 additions and 76 deletions

View File

@@ -8,7 +8,6 @@ import subprocess
import tempfile
from typing import Any, Type
from cuda import cuda, nvrtc, cudart
import cupy as cp
import numpy as np
@@ -22,35 +21,14 @@ except ImportError:
torchTensor = Type[Any]
def _check_cuda_errors(result):
if result[0].value:
raise RuntimeError(f"CUDA error code={result[0].value}({_cuda_get_error(result[0])})")
if len(result) == 1:
return None
elif len(result) == 2:
return result[1]
else:
return result[1:]
def _cuda_get_error(error):
if isinstance(error, cuda.CUresult):
err, name = cuda.cuGetErrorName(error)
return name if err == cuda.CUresult.CUDA_SUCCESS else "<unknown>"
elif isinstance(error, cudart.cudaError_t):
return cudart.cudaGetErrorName(error)[1]
elif isinstance(error, nvrtc.nvrtcResult):
return nvrtc.nvrtcGetErrorString(error)[1]
else:
raise RuntimeError("Unknown error type: {}".format(error))
class Kernel:
def __init__(self, ptx: bytes, kernel_name: str, device_id: int):
self._context = _check_cuda_errors(cuda.cuCtxGetCurrent())
assert self._context is not None
self._module = _check_cuda_errors(cuda.cuModuleLoadData(ptx))
self._kernel = _check_cuda_errors(cuda.cuModuleGetFunction(self._module, kernel_name.encode()))
CU_LAUNCH_PARAM_BUFFER_POINTER = 0x01
CU_LAUNCH_PARAM_BUFFER_SIZE = 0x02
CU_LAUNCH_PARAM_END = 0x00 if not cp.cuda.runtime.is_hip else 0x03
def __init__(self, ptx: bytes, kernel_name: str):
self._module = cp.cuda.driver.moduleLoadData(ptx)
self._kernel = cp.cuda.driver.moduleGetFunction(self._module, kernel_name)
def launch_kernel(
self,
@@ -58,26 +36,27 @@ class Kernel:
nblocks: int,
nthreads: int,
shared: int,
stream: Type[cuda.CUstream] or Type[cudart.cudaStream_t],
stream: Type[cp.cuda.Stream] or Type[None],
):
buffer = (ctypes.c_byte * len(params)).from_buffer_copy(params)
buffer_size = ctypes.c_size_t(len(params))
config = np.array(
[
cuda.CU_LAUNCH_PARAM_BUFFER_POINTER,
Kernel.CU_LAUNCH_PARAM_BUFFER_POINTER,
ctypes.addressof(buffer),
cuda.CU_LAUNCH_PARAM_BUFFER_SIZE,
Kernel.CU_LAUNCH_PARAM_BUFFER_SIZE,
ctypes.addressof(buffer_size),
cuda.CU_LAUNCH_PARAM_END,
Kernel.CU_LAUNCH_PARAM_END,
],
dtype=np.uint64,
)
_check_cuda_errors(
cuda.cuLaunchKernel(self._kernel, nblocks, 1, 1, nthreads, 1, 1, shared, stream, 0, config.ctypes.data)
cuda_stream = stream.ptr if stream else 0
cp.cuda.driver.launchKernel(
self._kernel, nblocks, 1, 1, nthreads, 1, 1, shared, cuda_stream, 0, config.ctypes.data
)
def __del__(self):
cuda.cuModuleUnload(self._module)
cp.cuda.driver.moduleUnload(self._module)
class KernelBuilder:
@@ -96,35 +75,48 @@ class KernelBuilder:
self.macros = None
if file_dir:
self.macros = ["-D{}={}".format(macro, value) for macro, value in macro_dict.items()]
device_id = cp.cuda.Device().id
ptx = self._compile_cuda(os.path.join(self._current_file_dir, file), f"{kernel_name}.ptx", device_id)
self._kernel = Kernel(ptx, kernel_name, device_id)
ptx = self._compile_cuda(os.path.join(self._current_file_dir, file), f"{kernel_name}.ptx")
self._kernel = Kernel(ptx, kernel_name)
self.kernel_map[kernel_key] = self._kernel
def _compile_cuda(self, source_file, output_file, device_id, std_version="c++17"):
def _compile_cuda(self, source_file, output_file, std_version="c++17"):
mscclpp_home = os.environ.get("MSCCLPP_HOME", "/usr/local/mscclpp")
include_dir = os.path.join(mscclpp_home, "include")
major = _check_cuda_errors(
cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device_id)
)
minor = _check_cuda_errors(
cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, device_id)
)
cuda_home = os.environ.get("CUDA_HOME")
nvcc = os.path.join(cuda_home, "bin/nvcc") if cuda_home else "nvcc"
command = [
nvcc,
f"-std={std_version}",
"-ptx",
"-Xcompiler",
"-Wall,-Wextra",
f"-I{include_dir}",
f"{source_file}",
f"--gpu-architecture=compute_{major}{minor}",
f"--gpu-code=sm_{major}{minor},compute_{major}{minor}",
"-o",
f"{self._tempdir.name}/{output_file}",
]
if not cp.cuda.runtime.is_hip:
compute_capability = cp.cuda.Device().compute_capability
cuda_home = os.environ.get("CUDA_HOME")
nvcc = os.path.join(cuda_home, "bin/nvcc") if cuda_home else "nvcc"
command = [
nvcc,
f"-std={std_version}",
"-ptx",
"-Xcompiler",
"-Wall,-Wextra",
f"-I{include_dir}",
f"{source_file}",
f"--gpu-architecture=compute_{compute_capability}",
f"--gpu-code=sm_{compute_capability},compute_{compute_capability}",
"-o",
f"{self._tempdir.name}/{output_file}",
]
else:
# The GCN arch name looks like "gfx942:sramecc+:xnack-"; keep only the part before the first ":" (e.g. "gfx942").
gcn_arch = (
cp.cuda.runtime.getDeviceProperties(cp.cuda.Device().id)["gcnArchName"].decode("utf-8").split(":")[0]
)
rocm_home = os.environ.get("ROCM_HOME")
hipcc = os.path.join(rocm_home, "bin/hipcc") if rocm_home else "hipcc"
command = [
hipcc,
f"-std={std_version}",
"--genco",
"-D__HIP_PLATFORM_AMD__",
f"--offload-arch={gcn_arch}",
f"-I{include_dir}",
f"{source_file}",
"-o",
f"{self._tempdir.name}/{output_file}",
]
if self.macros:
command += self.macros
try:

View File

@@ -1,7 +1,11 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#if defined(__HIP_PLATFORM_AMD__)
#include <hip/hip_fp16.h>
#else
#include <cuda_fp16.h>
#endif
#include <mscclpp/concurrency_device.hpp>
#include <mscclpp/nvls_device.hpp>

View File

@@ -84,7 +84,7 @@ def check_correctness(memory, func, niter=100):
for p in range(niter):
memory[:] = cp.ones(memory.shape).astype(data_type) * (p * MPI.COMM_WORLD.size + MPI.COMM_WORLD.rank)
cp.cuda.runtime.deviceSynchronize()
output_memory = func(0)
output_memory = func(None)
cp.cuda.runtime.deviceSynchronize()
expected = cp.zeros_like(memory)
for i in range(MPI.COMM_WORLD.size):
@@ -110,7 +110,7 @@ def bench_time(niter: int, func):
with stream:
stream.begin_capture()
for i in range(niter):
func(stream.ptr)
func(stream)
graph = stream.end_capture()
# now run a warm up round

View File

@@ -66,8 +66,8 @@ class MscclppAllReduce1:
self.set_params(nblocks, block_size, read_only)
def __call__(self, stream_ptr):
self.kernel.launch_kernel(self.params, self.nblocks, self.block_size, 0, stream_ptr)
def __call__(self, stream):
self.kernel.launch_kernel(self.params, self.nblocks, self.block_size, 0, stream)
return self.memory
def set_params(self, nblocks, block_size, read_only):
@@ -131,8 +131,8 @@ class MscclppAllReduce2:
self.set_params(nblocks, block_size)
def __call__(self, stream_ptr):
self.kernel.launch_kernel(self.params, self.nblocks, self.block_size, 0, stream_ptr)
def __call__(self, stream):
self.kernel.launch_kernel(self.params, self.nblocks, self.block_size, 0, stream)
return self.memory_out
def set_params(self, nblocks, block_size):
@@ -201,8 +201,8 @@ class MscclppAllReduce3:
self.set_params(nblocks, block_size)
def __call__(self, stream_ptr):
self.kernel.launch_kernel(self.params, 24, 1024, 0, stream_ptr)
def __call__(self, stream):
self.kernel.launch_kernel(self.params, 24, 1024, 0, stream)
return self.memory
def set_params(self, nblocks, block_size):
@@ -295,8 +295,8 @@ class MscclppAllReduce4:
self.set_params(nblocks, block_size, pipeline_depth)
def __call__(self, stream_ptr):
self.kernel.launch_kernel(self.params, self.nblocks, self.block_size, 0, stream_ptr)
def __call__(self, stream):
self.kernel.launch_kernel(self.params, self.nblocks, self.block_size, 0, stream)
return self.memory
def set_params(self, nblocks, block_size, pipeline_depth):
@@ -388,8 +388,8 @@ class MscclppAllReduce5:
self.set_params(nblocks, block_size)
def __call__(self, stream_ptr):
self.kernel.launch_kernel(self.params, self.nblocks, self.block_size, 0, stream_ptr)
def __call__(self, stream):
self.kernel.launch_kernel(self.params, self.nblocks, self.block_size, 0, stream)
return self.memory_out
def set_params(self, nblocks, block_size):

View File

@@ -16,7 +16,8 @@ class NcclAllReduce:
else:
raise RuntimeError("Make sure that the data type is mapped to the correct NCCL data type")
def __call__(self, stream_ptr):
def __call__(self, stream):
stream_ptr = stream.ptr if stream else 0
self.nccl_comm.allReduce(
self.memory.data.ptr, self.memory.data.ptr, self.memory.size, self.nccl_dtype, nccl.NCCL_SUM, stream_ptr
)

View File

@@ -1,7 +1,6 @@
mpi4py
cupy-cuda11x
prettytable
cuda-python
netifaces
pytest
numpy

View File

@@ -1,7 +1,6 @@
mpi4py
cupy-cuda12x
prettytable
cuda-python
netifaces
pytest
numpy