Mirror of https://github.com/NVIDIA/cutlass.git, synced 2026-04-20 06:48:59 +00:00
Updates to Python interface for PyPI packaging (#1209)
* Updates
* Updates to notebooks
@@ -7,7 +7,46 @@
   "metadata": {},
   "source": [
    "# Basic example of using the CUTLASS Python interface\n",
    "This notebook walks through a basic example of using the CUTLASS Python interface to declare, compile, and run GEMMs.\n"
    "This notebook walks through a basic example of using the CUTLASS Python interface to declare, compile, and run GEMMs.\n",
    "\n",
    "[](https://colab.research.google.com/github/NVIDIA/cutlass/tree/master/examples/python/00_basic_gemm.ipynb)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "df94d7e6",
   "metadata": {},
   "source": [
    "## Prerequisites for running on Colab\n",
    "This notebook requires an NVIDIA GPU. If `nvidia-smi` fails, go to Runtime -> Change runtime type -> Hardware accelerator and confirm a GPU is selected."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "71c7a069",
   "metadata": {},
   "outputs": [],
   "source": [
    "!#nvidia-smi"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cf16785d",
   "metadata": {},
   "source": [
    "If running on Colab, you will need to install the CUTLASS Python interface. To do so, uncomment the following line and run the cell:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c819bb68",
   "metadata": {},
   "outputs": [],
   "source": [
    "!#pip install nvidia-cutlass"
   ]
  },
  {
@@ -16,7 +55,8 @@
   "id": "962324fd",
   "metadata": {},
   "source": [
    "We first import various packages needed for the example and construct the input and output tensors that will be used in our example.\n"
    "## General setup\n",
    "We first import various packages needed for the example and construct the input and output tensors that will be used in our example."
   ]
  },
  {
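The Colab-prerequisite cells that these hunks add check for a GPU via `nvidia-smi`. An equivalent Python-side check can be done with `cuda-python`, the driver bindings this commit's own sources use; a minimal sketch:

```python
from cuda import cuda

# Initialize the CUDA driver API and count visible devices.
err, = cuda.cuInit(0)
assert err == cuda.CUresult.CUDA_SUCCESS, f"cuInit failed with error {err}"

err, device_count = cuda.cuDeviceGetCount()
assert err == cuda.CUresult.CUDA_SUCCESS, f"cuDeviceGetCount failed with error {err}"
print(f"Found {device_count} CUDA device(s)")
```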
@@ -7,14 +7,55 @@
   "metadata": {},
   "source": [
    "# Example of using elementwise activation functions in the CUTLASS Python interface\n",
    "This notebook walks through a basic example of using the CUTLASS Python interface to declare, compile, and run GEMMs with different epilogues.\n"
    "This notebook walks through a basic example of using the CUTLASS Python interface to declare, compile, and run GEMMs with different epilogues.\n",
    "\n",
    "[](https://colab.research.google.com/github/NVIDIA/cutlass/tree/master/examples/python/01_epilogue.ipynb)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3ca993fe",
   "id": "28c916da",
   "metadata": {},
   "source": [
    "## Prerequisites for running on Colab\n",
    "This notebook requires an NVIDIA GPU. If `nvidia-smi` fails, go to Runtime -> Change runtime type -> Hardware accelerator and confirm a GPU is selected."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0fcea8ea",
   "metadata": {},
   "outputs": [],
   "source": [
    "!#nvidia-smi"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7ec60b57",
   "metadata": {},
   "source": [
    "If running on Colab, you will need to install the CUTLASS Python interface. To do so, uncomment the following line and run the cell:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1db9e51c",
   "metadata": {},
   "outputs": [],
   "source": [
    "!#pip install nvidia-cutlass"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "962324fd",
   "metadata": {},
   "source": [
    "## General setup\n",
    "We first import various packages needed for the example and construct the input and output tensors that will be used in our example."
   ]
  },
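For context on what the 01_epilogue notebook itself demonstrates: after declaring a GEMM plan, an elementwise activation is attached as the epilogue. A rough sketch; `cutlass.epilogue.relu` and the `activation` attribute are assumptions based on that notebook, not guaranteed by this diff:

```python
import numpy as np
import cutlass

plan = cutlass.op.Gemm(element=np.float16, layout=cutlass.LayoutType.RowMajor)
plan.activation = cutlass.epilogue.relu  # assumed name; swaps linear combination for ReLU

A, B, C, D = [np.ones((256, 256), dtype=np.float16) for _ in range(4)]
plan.run(A, B, C, D)  # D = relu(alpha * (A @ B) + beta * C)
```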
@@ -10,6 +10,52 @@
    "This notebook walks through a basic example of using the CUTLASS Python interface to declare\n",
    "a grouped GEMM kernel and export it as a PyTorch CUDA extension. Note that GEMM and Conv2d can also be exported as PyTorch CUDA extensions. \n",
    "\n",
    "[](https://colab.research.google.com/github/NVIDIA/cutlass/tree/master/examples/python/02_pytorch_extension_grouped_gemm.ipynb)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2d70560e",
   "metadata": {},
   "source": [
    "## Prerequisites for running on Colab\n",
    "This notebook requires an NVIDIA GPU. If `nvidia-smi` fails, go to Runtime -> Change runtime type -> Hardware accelerator and confirm a GPU is selected."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cc7c7458",
   "metadata": {},
   "outputs": [],
   "source": [
    "!#nvidia-smi"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2107bb0d",
   "metadata": {},
   "source": [
    "If running on Colab, you will need to install the CUTLASS Python interface and PyTorch. To do so, uncomment the following line and run the cell:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a9852cb8",
   "metadata": {},
   "outputs": [],
   "source": [
    "!#pip install nvidia-cutlass torch --extra-index-url https://download.pytorch.org/whl/cu121"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "962324fd",
   "metadata": {},
   "source": [
    "## Background on grouped GEMM\n",
    "Grouped GEMM enables one to execute a set of GEMMs (each with potentially different sizes and strides)\n",
    "in a single CUDA kernel. It can be thought of as a generalized version of a pointer-array GEMM,\n",
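The background text above describes grouped GEMM as one kernel covering many independent problems. A sketch of how the notebook drives it through the Python interface; `cutlass.op.GroupedGemm` follows the notebook this hunk belongs to, and the exact parameters are assumptions:

```python
import numpy as np
import cutlass

# One plan covers GEMMs of different shapes; all share dtype and layout here.
plan = cutlass.op.GroupedGemm(element=np.float16, layout=cutlass.LayoutType.RowMajor)

# Illustrative problem sizes: (M, N, K) per group.
sizes = [(128, 128, 64), (256, 512, 128), (64, 32, 16)]
As = [np.ones((m, k), dtype=np.float16) for m, n, k in sizes]
Bs = [np.ones((k, n), dtype=np.float16) for m, n, k in sizes]
Cs = [np.zeros((m, n), dtype=np.float16) for m, n, k in sizes]
Ds = [np.zeros((m, n), dtype=np.float16) for m, n, k in sizes]

plan.run(As, Bs, Cs, Ds)  # a single kernel launch executes all three GEMMs
```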
@@ -8,6 +8,48 @@
    "\n",
    "This notebook walks through a basic example of using the CUTLASS Python interface to declare, compile, and run Conv2d. \n",
    "\n",
    "[](https://colab.research.google.com/github/NVIDIA/cutlass/tree/master/examples/python/03_basic_conv2d.ipynb)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prerequisites for running on Colab\n",
    "This notebook requires an NVIDIA GPU. If `nvidia-smi` fails, go to Runtime -> Change runtime type -> Hardware accelerator and confirm a GPU is selected."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!#nvidia-smi"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If running on Colab, you will need to install the CUTLASS Python interface. To do so, uncomment the following line and run the cell:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!#pip install nvidia-cutlass"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## General setup\n",
    "We first import various packages needed for the example and construct the input and output tensors that will be used in our example."
   ]
  },
@@ -7,14 +7,55 @@
   "metadata": {},
   "source": [
    "# Example of using epilogue visitor in the CUTLASS Python interface\n",
    "This notebook walks through a basic example of using the CUTLASS Python interface to declare, compile, and run GEMMs with different epilogues through CUTLASS Epilogue Visitor."
    "This notebook walks through a basic example of using the CUTLASS Python interface to declare, compile, and run GEMMs with different epilogues through CUTLASS Epilogue Visitor.\n",
    "\n",
    "[](https://colab.research.google.com/github/NVIDIA/cutlass/tree/master/examples/python/04_epilogue_visitor.ipynb)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3ca993fe",
   "id": "3a800e79",
   "metadata": {},
   "source": [
    "## Prerequisites for running on Colab\n",
    "This notebook requires an NVIDIA GPU. If `nvidia-smi` fails, go to Runtime -> Change runtime type -> Hardware accelerator and confirm a GPU is selected."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9cfff2c8",
   "metadata": {},
   "outputs": [],
   "source": [
    "!#nvidia-smi"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "06706f00",
   "metadata": {},
   "source": [
    "If running on Colab, you will need to install the CUTLASS Python interface. To do so, uncomment the following line and run the cell:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "491a7314",
   "metadata": {},
   "outputs": [],
   "source": [
    "!#pip install nvidia-cutlass"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "962324fd",
   "metadata": {},
   "source": [
    "## General setup\n",
    "We first import various packages needed for the example and construct the input and output tensors that will be used in our example."
   ]
  },
@@ -3,11 +3,12 @@ requires = ["setuptools"]
build-backend = "setuptools.build_meta"

[project]
name = "cutlass"
name = "nvidia-cutlass"
version = "3.3.0.0"
description = "CUTLASS"
readme = "README.md"
requires-python = ">=3.8"
license = {file = "LICENSE.txt"}
classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: BSD License",
@@ -18,9 +18,6 @@ A, B, C, D = [np.ones((4096, 4096), dtype=np.float16) for i in range(4)]
plan.run(A, B, C, D)
```

**NOTE:** The CUTLASS Python interface is currently an experimental release. The API may change in the future.
We welcome feedback from the community.

### Overview
The CUTLASS Python interface aims to provide an easy-to-use interface for using CUTLASS via Python. Toward this goal,
the CUTLASS Python interface attempts to:
@@ -87,12 +84,17 @@ If these environment variables are not set, the installation process will infer

**NOTE:** The version of `cuda-python` installed must match the CUDA version in `CUDA_INSTALL_PATH`.

#### Installation
The CUTLASS Python interface can currently be installed by navigating to the root of the CUTLASS directory and performing
Stable releases of the CUTLASS Python interface are available via the `nvidia-cutlass` PyPI package. Any other packages with the name `cutlass` are not affiliated with NVIDIA CUTLASS.
```bash
pip install nvidia-cutlass
```

The CUTLASS Python interface can also be installed from source by navigating to the root of the CUTLASS directory and performing
```bash
pip install .
```

If you would like to be able to make changes to CULASS Python interface and have them reflected when using the interface, perform:
If you would like to be able to make changes to CUTLASS Python interface and have them reflected when using the interface, perform:
```bash
pip install -e .
```
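The hunk header above shows only the tail of the README quickstart. The full snippet that `plan.run(A, B, C, D)` closes out reads roughly as follows; the plan declaration line is recalled from the README and should be treated as approximate:

```python
import numpy as np
import cutlass

# Declare a float16 row-major GEMM plan, then run it on NumPy arrays.
plan = cutlass.op.Gemm(element=np.float16, layout=cutlass.LayoutType.RowMajor)
A, B, C, D = [np.ones((4096, 4096), dtype=np.float16) for i in range(4)]
plan.run(A, B, C, D)  # computes D = alpha * (A @ B) + beta * C
```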
@@ -85,7 +85,15 @@ this = sys.modules[__name__]
this.logger = logging.getLogger(__name__)

# RMM is only supported for Python 3.9+
this.use_rmm = (sys.version_info.major == 3 and sys.version_info.minor > 8) or sys.version_info.major > 3
if (sys.version_info.major == 3 and sys.version_info.minor > 8) or sys.version_info.major > 3:
    try:
        import rmm
        this.use_rmm = True
    except ImportError:
        this.use_rmm = False
else:
    this.use_rmm = False


def set_log_level(level: int):
    """
@@ -134,9 +142,8 @@ def get_memory_pool():
    return this.memory_pool


from cuda import cuda
from cuda import cuda, cudart

this._context = None
this._device_id = None

def initialize_cuda_context():
    if this._device_id is not None:
@@ -149,10 +156,10 @@ def initialize_cuda_context():
    device_id = os.getenv("CUTLASS_CUDA_DEVICE_ID")
    if device_id is None:
        if not this.use_rmm:
            # We must manually call cuInit in the absence of RMM
            err, = cuda.cuInit(0)
            if err != cuda.CUresult.CUDA_SUCCESS:
                raise Exception(f"cuInit failed with error {err}")
        # Manually call cuInit() and create context by making a runtime API call
        err, = cudart.cudaFree(0)
        if err != cudart.cudaError_t.cudaSuccess:
            raise RuntimeError(f"cudaFree failed with error {err}")

        err, device_count = cuda.cuDeviceGetCount()
        if err != cuda.CUresult.CUDA_SUCCESS:
@@ -163,16 +170,6 @@ def initialize_cuda_context():

    this._device_id = device_id

    if not this.use_rmm and this._context is None:
        # We must manually initialize the context in the absence of RMM
        err, device = cuda.cuDeviceGet(this._device_id)
        if err != cuda.CUresult.CUDA_SUCCESS:
            raise Exception(f"cuDeviceGet failed with error {err}")

        err, this._context = cuda.cuCtxCreate(0, device)
        if err != cuda.CUresult.CUDA_SUCCESS:
            raise Exception(f"cuCtxCreate failed with error {err}")


def device_id() -> int:
    initialize_cuda_context()
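The change above replaces explicit `cuInit`/`cuCtxCreate` calls with a single `cudaFree(0)`. This leans on a standard CUDA idiom: the first runtime-API call initializes the driver and binds the device's primary context, which libraries such as RMM can then share. A minimal sketch of the idiom, assuming only `cuda-python` is installed:

```python
from cuda import cuda, cudart

# Any runtime API call initializes CUDA and binds the primary context;
# cudaFree(0) is the conventional no-op used for this purpose.
err, = cudart.cudaFree(0)
assert err == cudart.cudaError_t.cudaSuccess, f"cudaFree failed with error {err}"

# The driver API now reports a current context without any cuCtxCreate call.
err, ctx = cuda.cuCtxGetCurrent()
assert err == cuda.CUresult.CUDA_SUCCESS, f"cuCtxGetCurrent failed with error {err}"
print("current context:", ctx)
```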
@@ -53,7 +53,7 @@ IncludeTemplate = r"""#include "${include}"
def compile_with_nvcc(cmd, source, error_file):
    succeed = True
    try:
        subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=True)
        subprocess.check_output(cmd, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        error_message = e.output.decode()
        with open(error_file, "w") as error_out:
@@ -82,20 +82,19 @@ class CompilationOptions:
        self.arch = arch

    def get_str(self):
        options = ""

        opts = []
        for flag in self.flags:
            options += " " + flag
            opts.append(flag)

        for incl in self.include_paths:
            options += " --include-path=%s" % incl
            opts.append(f"--include-path={incl}")

        arch_flag = " -arch=sm_%d" % self.arch
        arch_flag = f"-arch=sm_{self.arch}"
        if self.arch == 90:
            arch_flag += "a"
        options += arch_flag
        opts.append(arch_flag)

        return options
        return " ".join(opts)

    def get(self):
        options = []
@@ -104,9 +103,9 @@ class CompilationOptions:
            options.append(bytes(str.encode(flag)))

        for incl in self.include_paths:
            options.append(bytes(str.encode("--include-path=%s" % incl)))
            options.append(bytes(str.encode(f" --include-path={incl}")))

        arch_flag = " -arch=sm_%d" % self.arch
        arch_flag = f" -arch=sm_{self.arch}"
        if self.arch == 90:
            arch_flag += "a"

@@ -323,34 +322,35 @@ class ArtifactManager:
            "tarfile": temp_cubin.name,
        }
        cmd = SubstituteTemplate(cmd_template, values)
        compile_with_nvcc(cmd, source_buffer_device, "./cutlass_python_compilation_device_error.txt")
        compile_with_nvcc(cmd.split(" "), source_buffer_device, "./cutlass_python_compilation_device_error.txt")

        # load the cubin image
        with open(temp_cubin.name, "rb") as file:
            cubin_image = file.read()

        # Set up the host-side library code
        cmd_template = (
            "echo '%s'|${cuda_install_path}/bin/nvcc -x cu -Xcompiler=\"-fpermissive -w -fPIC\" ${options}"
            % source_buffer_host
        )
        cmd = SubstituteTemplate(
            cmd_template,
            {
                "cuda_install_path": cuda_install_path(),
                "options": host_compilation_options.get_str(),
            },
        )

        tempfile.tempdir = "./"
        temp = tempfile.NamedTemporaryFile(
        temp_src = tempfile.NamedTemporaryFile(
            prefix="host_src", suffix=".cu", delete=True)

        # Write the host source
        with open(temp_src.name, "w") as outfile:
            outfile.write(source_buffer_host)

        temp_dst = tempfile.NamedTemporaryFile(
            prefix="host_func", suffix=".so", delete=True)

        cmd += " - -shared -o %s -lcudart -lcuda" % temp.name
        compile_with_nvcc(cmd, source_buffer_host, error_file="./cutlass_python_compilation_host_error.txt")
        host_lib = ctypes.CDLL(temp.name)
        # Set up host compilation arguments
        cmd = []
        cmd.append(f"{cuda_install_path()}/bin/nvcc")
        cmd.extend(["-x", "cu", "-Xcompiler=-fpermissive", "-Xcompiler=-w", "-Xcompiler=-fPIC"])
        cmd.extend(host_compilation_options.get_str().split(" "))
        cmd.extend(["-shared", "-o", temp_dst.name, temp_src.name, "-lcudart", "-lcuda"])

        return cubin_image, host_lib, temp
        # Compile and load the library
        compile_with_nvcc(cmd, source_buffer_host, error_file="./cutlass_python_compilation_host_error.txt")
        host_lib = ctypes.CDLL(temp_dst.name)

        return cubin_image, host_lib, temp_dst

    def add_module(self, operations, compile_options=None, bypass_cache=False):
        """
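The reworked host path in this hunk follows a compile-and-load pattern: write generated source to a temporary `.cu` file, invoke `nvcc` with a list argv (avoiding `shell=True` and the quoting pitfalls of the old `echo '%s' | nvcc` pipeline), and load the resulting shared object with `ctypes.CDLL`. A self-contained sketch of the same pattern, with a trivial function standing in for the generated host code; assumes `nvcc` is on `PATH`:

```python
import ctypes
import subprocess
import tempfile

source = 'extern "C" int answer() { return 42; }'

src = tempfile.NamedTemporaryFile(prefix="host_src", suffix=".cu", delete=False)
dst = tempfile.NamedTemporaryFile(prefix="host_func", suffix=".so", delete=False)
src.write(source.encode())
src.close()
dst.close()

# List argv: no shell interpolation of paths or source text.
cmd = ["nvcc", "-x", "cu", "-Xcompiler=-fPIC", "-shared", "-o", dst.name, src.name]
subprocess.check_output(cmd, stderr=subprocess.STDOUT)

lib = ctypes.CDLL(dst.name)
lib.answer.restype = ctypes.c_int
print(lib.answer())  # 42
```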
@@ -1,5 +1,12 @@
# Installation

## Installing a stable release

Stable releases of the CUTLASS Python interface are available via the `nvidia-cutlass` PyPI package. Any other packages with the name `cutlass` are not affiliated with NVIDIA CUTLASS.
```bash
pip install nvidia-cutlass
```

## Installing from source

Installing from source requires the latest CUDA Toolkit that matches the major.minor of CUDA Python installed.
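After either install path, the installed distribution can be verified without touching the GPU; `importlib.metadata` reads the version recorded by the packaging metadata this commit updates:

```python
from importlib.metadata import version

# The PyPI distribution is "nvidia-cutlass"; the module it installs is "cutlass".
print(version("nvidia-cutlass"))  # e.g. "3.3.0.0" per this release
```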
@@ -1,5 +1,5 @@
[metadata]
name = cutlass
name = nvidia-cutlass
version = 3.3.0.0

[options]
@@ -28,3 +28,6 @@ include_package_data = True

[options.package_data]
cutlass_library.source = include/**/*, examples/**/*, tools/**/*

[options.exclude_package_data]
cutlass_library.source = include/**/*.py, examples/**/*.py, tools/**/*.py