diff --git a/python/examples/auto_throughput.py b/python/examples/auto_throughput.py index 70569b7..80a94e9 100644 --- a/python/examples/auto_throughput.py +++ b/python/examples/auto_throughput.py @@ -25,7 +25,7 @@ def as_cuda_Stream(cs: nvbench.CudaStream) -> cuda.cudadrv.driver.Stream: return cuda.external_stream(cs.addressof()) -def make_kernel(items_per_thread: int) -> cuda.compiler.AutoJitCUDAKernel: +def make_kernel(items_per_thread: int) -> cuda.dispatcher.CUDADispatcher: @cuda.jit def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr): tid = cuda.grid(1) @@ -59,7 +59,8 @@ def throughput_bench(state: nvbench.State) -> None: krn = make_kernel(ipt) # warm-up call ensures that kernel is loaded into context - # before blocking kernel is launched + # before blocking kernel is launched. Kernel loading may cause + # a synchronization to occur. krn[blocks_in_grid, threads_per_block, alloc_stream, 0]( stride, elements, inp_arr, out_arr ) diff --git a/python/examples/axes.py b/python/examples/axes.py index 5adc03a..f01607b 100644 --- a/python/examples/axes.py +++ b/python/examples/axes.py @@ -1,3 +1,19 @@ +# Copyright 2025 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 with the LLVM exception +# (the "License"); you may not use this file except in compliance with +# the License. +# +# You may obtain a copy of the License at +# +# http://llvm.org/foundation/relicensing/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import ctypes import sys from typing import Dict, Optional, Tuple diff --git a/python/examples/cccl_parallel_segmented_reduce.py b/python/examples/cccl_parallel_segmented_reduce.py index dd77f46..58586be 100644 --- a/python/examples/cccl_parallel_segmented_reduce.py +++ b/python/examples/cccl_parallel_segmented_reduce.py @@ -1,3 +1,19 @@ +# Copyright 2025 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 with the LLVM exception +# (the "License"); you may not use this file except in compliance with +# the License. +# +# You may obtain a copy of the License at +# +# http://llvm.org/foundation/relicensing/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import sys import cuda.cccl.parallel.experimental.algorithms as algorithms diff --git a/python/examples/cpu_activity.py b/python/examples/cpu_activity.py index d51f6ad..16f70cc 100644 --- a/python/examples/cpu_activity.py +++ b/python/examples/cpu_activity.py @@ -1,3 +1,19 @@ +# Copyright 2025 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 with the LLVM exception +# (the "License"); you may not use this file except in compliance with +# the License. +# +# You may obtain a copy of the License at +# +# http://llvm.org/foundation/relicensing/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import sys import time diff --git a/python/examples/cupy_extract.py b/python/examples/cupy_extract.py index d1e86ef..59177bc 100644 --- a/python/examples/cupy_extract.py +++ b/python/examples/cupy_extract.py @@ -1,3 +1,19 @@ +# Copyright 2025 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 with the LLVM exception +# (the "License"); you may not use this file except in compliance with +# the License. +# +# You may obtain a copy of the License at +# +# http://llvm.org/foundation/relicensing/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import sys import cuda.nvbench as nvbench diff --git a/python/examples/cutlass_gemm.py b/python/examples/cutlass_gemm.py new file mode 100644 index 0000000..bba8633 --- /dev/null +++ b/python/examples/cutlass_gemm.py @@ -0,0 +1,112 @@ +# Copyright 2025 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 with the LLVM exception +# (the "License"); you may not use this file except in compliance with +# the License. +# +# You may obtain a copy of the License at +# +# http://llvm.org/foundation/relicensing/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+
+import sys
+
+import cuda.bindings.driver as driver
+import cuda.core.experimental as core
+import cuda.nvbench as nvbench
+import cupy as cp
+import cutlass
+import numpy as np
+
+
+def as_bindings_Stream(cs: nvbench.CudaStream) -> driver.CUstream:
+    """Wrap the address reported by ``cs.addressof()`` in a ``driver.CUstream``."""
+    return driver.CUstream(cs.addressof())
+
+
+def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream:
+    """Build a ``cuda.core`` stream from the handle given by ``cs.addressof()``."""
+    return core.Stream.from_handle(cs.addressof())
+
+
+def _check_driver_call(result) -> None:
+    """Raise ``RuntimeError`` when a ``cuda.bindings`` driver call failed.
+
+    Driver wrappers return a tuple whose first element is the ``CUresult``
+    status; any status other than ``CUDA_SUCCESS`` is reported.
+    """
+    status = result[0]
+    if status != driver.CUresult.CUDA_SUCCESS:
+        raise RuntimeError(f"CUDA driver call failed with {status!r}")
+
+
+def make_cp_array(arr_h: np.ndarray, dev_buf: core.Buffer, dev_id: int) -> cp.ndarray:
+    """Return a CuPy array viewing ``dev_buf`` with the shape/dtype of ``arr_h``.
+
+    No data is copied: the buffer is wrapped in ``cp.cuda.UnownedMemory`` with
+    ``dev_buf`` passed as the owner object.
+
+    Args:
+        arr_h: Host array that supplies the shape and dtype of the view.
+        dev_buf: Device buffer to view; its ``handle`` and ``size`` are used.
+        dev_id: Device ordinal forwarded to ``cp.cuda.UnownedMemory``.
+
+    Note:
+        No strides are passed to ``cp.ndarray``, so the view does not follow
+        ``arr_h.strides``; callers should copy C-contiguous host data into
+        ``dev_buf`` (assumes CuPy's default C-order strides -- TODO confirm).
+    """
+    cp_memview = cp.cuda.UnownedMemory(
+        int(dev_buf.handle), dev_buf.size, dev_buf, dev_id
+    )
+    zero_offset = 0
+    return cp.ndarray(
+        arr_h.shape,
+        dtype=arr_h.dtype,
+        memptr=cp.cuda.MemoryPointer(cp_memview, zero_offset),
+    )
+
+
+def cutlass_gemm(state: nvbench.State) -> None:
+    """Benchmark a float64 ``cutlass.op.Gemm`` plan on NVBench's stream.
+
+    Reads the "N", "R" and "alpha" axes from ``state``. The operands are a
+    random ``N x R`` matrix A, its transpose B, an ``N x N`` identity C and a
+    zero-initialised ``N x N`` output D. The plan is run once outside of the
+    timed region so that compilation is not measured.
+    """
+    n = state.get_int64("N")
+    r = state.get_int64("R")
+
+    alpha = state.get_float64("alpha")
+
+    dt = np.float64
+    A_h = np.random.randn(n, r).astype(dt)
+    # np.copy() defaults to order="K", which keeps the column-major layout of
+    # the transposed view (the very same bytes as A_h). Request a C-order copy
+    # so that the bytes sent to the device hold A transposed in row-major form.
+    B_h = A_h.T.copy(order="C")
+    C_h = np.eye(n, dtype=dt)
+    D_h = np.zeros_like(C_h)
+
+    if n >= 1024:
+        # allow more time for large inputs
+        state.set_timeout(360)
+
+    dev_id = state.get_device()
+    cs = state.get_stream()
+    s = as_bindings_Stream(cs)
+    core_s = as_core_Stream(cs)
+
+    # one memory resource serves all four allocations
+    mem_resource = core.DeviceMemoryResource(dev_id)
+    host_arrays = (A_h, B_h, C_h, D_h)
+    dev_bufs = [mem_resource.allocate(arr_h.nbytes, core_s) for arr_h in host_arrays]
+
+    # host arrays stay referenced until this function returns, so the
+    # asynchronous copies never read freed memory
+    for arr_h, buf_d in zip(host_arrays, dev_bufs):
+        _check_driver_call(
+            driver.cuMemcpyAsync(buf_d.handle, arr_h.ctypes.data, arr_h.nbytes, s)
+        )
+
+    A_cp, B_cp, C_cp, D_cp = (
+        make_cp_array(arr_h, buf_d, dev_id)
+        for arr_h, buf_d in zip(host_arrays, dev_bufs)
+    )
+
+    plan = cutlass.op.Gemm(
+        A=A_cp,
+        B=B_cp,
+        C=C_cp,
+        D=D_cp,
+        element=dt,
+        alpha=alpha,
+        beta=1,
+        layout=cutlass.LayoutType.RowMajor,
+    )
+    # warm-up to ensure compilation is not timed
+    plan.run(stream=s)
+
+    def launcher(launch: nvbench.Launch) -> None:
+        """Enqueue the GEMM on the launch stream without synchronizing."""
+        exec_s = as_bindings_Stream(launch.get_stream())
+        plan.run(stream=exec_s, sync=False)
+
+    state.exec(launcher)
+
+
+if __name__ == "__main__":
+    gemm_b = nvbench.register(cutlass_gemm)
+    gemm_b.add_int64_axis("R", [16, 64, 256])
+    gemm_b.add_int64_axis("N", [256, 512, 1024, 2048])
+
+    gemm_b.add_float64_axis("alpha", [1e-2])
+
+    nvbench.run_all_benchmarks(sys.argv)
diff --git a/python/examples/exec_tag_sync.py b/python/examples/exec_tag_sync.py
index 9315983..8d0789a 100644
--- a/python/examples/exec_tag_sync.py
+++ b/python/examples/exec_tag_sync.py
@@ -1,3 +1,19 @@
+# Copyright 2025 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 with the LLVM exception
+# (the "License"); you may not use this file except in compliance with
+# the License.
+#
+# You may obtain a copy of the License at
+#
+#     http://llvm.org/foundation/relicensing/LICENSE.txt
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ import ctypes import sys from typing import Optional diff --git a/python/examples/requirements.txt b/python/examples/requirements.txt new file mode 100644 index 0000000..35a9c48 --- /dev/null +++ b/python/examples/requirements.txt @@ -0,0 +1,7 @@ +numpy +numba +cupy +nvidia-cutlass +cuda-cccl +cuda-core +cuda-bindings diff --git a/python/examples/skip.py b/python/examples/skip.py index bb75b57..a5555d0 100644 --- a/python/examples/skip.py +++ b/python/examples/skip.py @@ -1,3 +1,19 @@ +# Copyright 2025 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 with the LLVM exception +# (the "License"); you may not use this file except in compliance with +# the License. +# +# You may obtain a copy of the License at +# +# http://llvm.org/foundation/relicensing/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import sys import cuda.cccl.headers as headers diff --git a/python/examples/throughput.py b/python/examples/throughput.py index 64b557b..3ae5c1d 100644 --- a/python/examples/throughput.py +++ b/python/examples/throughput.py @@ -25,7 +25,7 @@ def as_cuda_Stream(cs: nvbench.CudaStream) -> cuda.cudadrv.driver.Stream: return cuda.external_stream(cs.addressof()) -def make_kernel(items_per_thread: int) -> cuda.compiler.AutoJitCUDAKernel: +def make_kernel(items_per_thread: int) -> cuda.dispatcher.CUDADispatcher: @cuda.jit def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr): tid = cuda.grid(1) @@ -59,6 +59,13 @@ def throughput_bench(state: nvbench.State) -> None: krn = make_kernel(ipt) + # warm-up call ensures that kernel is loaded into context + # before blocking kernel is launched. Kernel loading may + # cause synchronization to occur. 
+ krn[blocks_in_grid, threads_per_block, alloc_stream, 0]( + stride, elements, inp_arr, out_arr + ) + def launcher(launch: nvbench.Launch): exec_stream = as_cuda_Stream(launch.get_stream()) krn[blocks_in_grid, threads_per_block, exec_stream, 0](