mirror of
https://github.com/NVIDIA/nvbench.git
synced 2026-04-19 22:38:52 +00:00
Change tests and examples from camelCase to snake_case method names to match the updated implementation
This commit is contained in:
@@ -41,18 +41,18 @@ def make_kernel(items_per_thread: int) -> Callable:
|
||||
|
||||
|
||||
def throughput_bench(state: nvbench.State) -> None:
|
||||
stride = state.getInt64("Stride")
|
||||
ipt = state.getInt64("ItemsPerThread")
|
||||
stride = state.get_int64("Stride")
|
||||
ipt = state.get_int64("ItemsPerThread")
|
||||
|
||||
nbytes = 128 * 1024 * 1024
|
||||
elements = nbytes // np.dtype(np.int32).itemsize
|
||||
|
||||
alloc_stream = as_cuda_Stream(state.getStream())
|
||||
alloc_stream = as_cuda_Stream(state.get_stream())
|
||||
inp_arr = cuda.device_array(elements, dtype=np.int32, stream=alloc_stream)
|
||||
out_arr = cuda.device_array(elements * ipt, dtype=np.int32, stream=alloc_stream)
|
||||
|
||||
state.addElementCount(elements, column_name="Elements")
|
||||
state.collectCUPTIMetrics()
|
||||
state.add_element_count(elements, column_name="Elements")
|
||||
state.collect_cupti_metrics()
|
||||
|
||||
threads_per_block = 256
|
||||
blocks_in_grid = (elements + threads_per_block - 1) // threads_per_block
|
||||
@@ -66,7 +66,7 @@ def throughput_bench(state: nvbench.State) -> None:
|
||||
)
|
||||
|
||||
def launcher(launch: nvbench.Launch):
|
||||
exec_stream = as_cuda_Stream(launch.getStream())
|
||||
exec_stream = as_cuda_Stream(launch.get_stream())
|
||||
krn[blocks_in_grid, threads_per_block, exec_stream, 0](
|
||||
stride, elements, inp_arr, out_arr
|
||||
)
|
||||
@@ -76,7 +76,7 @@ def throughput_bench(state: nvbench.State) -> None:
|
||||
|
||||
if __name__ == "__main__":
|
||||
b = nvbench.register(throughput_bench)
|
||||
b.addInt64Axis("Stride", [1, 2, 4])
|
||||
b.addInt64Axis("ItemsPerThread", [1, 2, 3, 4])
|
||||
b.add_int64_axis("Stride", [1, 2, 4])
|
||||
b.add_int64_axis("ItemsPerThread", [1, 2, 3, 4])
|
||||
|
||||
nvbench.run_all_benchmarks(sys.argv)
|
||||
|
||||
@@ -43,13 +43,13 @@ __global__ void sleep_kernel(double seconds) {
|
||||
|
||||
|
||||
def simple(state: nvbench.State):
|
||||
state.setMinSamples(1000)
|
||||
state.set_min_samples(1000)
|
||||
sleep_dur = 1e-3
|
||||
krn = make_sleep_kernel()
|
||||
launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0)
|
||||
|
||||
def launcher(launch: nvbench.Launch):
|
||||
s = as_core_Stream(launch.getStream())
|
||||
s = as_core_Stream(launch.get_stream())
|
||||
core.launch(s, launch_config, krn, sleep_dur)
|
||||
|
||||
state.exec(launcher)
|
||||
@@ -57,12 +57,13 @@ def simple(state: nvbench.State):
|
||||
|
||||
def single_float64_axis(state: nvbench.State):
|
||||
# get axis value, or default
|
||||
sleep_dur = state.getFloat64("Duration", 3.14e-4)
|
||||
default_sleep_dur = 3.14e-4
|
||||
sleep_dur = state.get_float64("Duration", default_sleep_dur)
|
||||
krn = make_sleep_kernel()
|
||||
launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0)
|
||||
|
||||
def launcher(launch: nvbench.Launch):
|
||||
s = as_core_Stream(launch.getStream())
|
||||
s = as_core_Stream(launch.get_stream())
|
||||
core.launch(s, launch_config, krn, sleep_dur)
|
||||
|
||||
state.exec(launcher)
|
||||
@@ -104,19 +105,19 @@ __global__ void copy_kernel(const T *in, U *out, ::cuda::std::size_t n)
|
||||
|
||||
|
||||
def copy_sweep_grid_shape(state: nvbench.State):
|
||||
block_size = state.getInt64("BlockSize")
|
||||
num_blocks = state.getInt64("NumBlocks")
|
||||
block_size = state.get_int64("BlockSize")
|
||||
num_blocks = state.get_int64("NumBlocks")
|
||||
|
||||
# Number of int32 elements in 256MiB
|
||||
nbytes = 256 * 1024 * 1024
|
||||
num_values = nbytes // ctypes.sizeof(ctypes.c_int32(0))
|
||||
|
||||
state.addElementCount(num_values)
|
||||
state.addGlobalMemoryReads(nbytes)
|
||||
state.addGlobalMemoryWrites(nbytes)
|
||||
state.add_element_count(num_values)
|
||||
state.add_global_memory_reads(nbytes)
|
||||
state.add_global_memory_writes(nbytes)
|
||||
|
||||
dev_id = state.getDevice()
|
||||
alloc_s = as_core_Stream(state.getStream())
|
||||
dev_id = state.get_device()
|
||||
alloc_s = as_core_Stream(state.get_stream())
|
||||
input_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s)
|
||||
output_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s)
|
||||
|
||||
@@ -124,20 +125,20 @@ def copy_sweep_grid_shape(state: nvbench.State):
|
||||
launch_config = core.LaunchConfig(grid=num_blocks, block=block_size, shmem_size=0)
|
||||
|
||||
def launcher(launch: nvbench.Launch):
|
||||
s = as_core_Stream(launch.getStream())
|
||||
s = as_core_Stream(launch.get_stream())
|
||||
core.launch(s, launch_config, krn, input_buf, output_buf, num_values)
|
||||
|
||||
state.exec(launcher)
|
||||
|
||||
|
||||
def copy_type_sweep(state: nvbench.State):
|
||||
type_id = state.getInt64("TypeID")
|
||||
type_id = state.get_int64("TypeID")
|
||||
|
||||
types_map = {
|
||||
0: (ctypes.c_uint8, "::cuda::std::uint8_t"),
|
||||
1: (ctypes.c_uint16, "::cuda::std::uint16_t"),
|
||||
2: (ctypes.c_uint32, "::cuda::std::uint32_t"),
|
||||
3: (ctypes.c_uint64, "::cuda::std::uint64_t"),
|
||||
0: (ctypes.c_uint8, "cuda::std::uint8_t"),
|
||||
1: (ctypes.c_uint16, "cuda::std::uint16_t"),
|
||||
2: (ctypes.c_uint32, "cuda::std::uint32_t"),
|
||||
3: (ctypes.c_uint64, "cuda::std::uint64_t"),
|
||||
4: (ctypes.c_float, "float"),
|
||||
5: (ctypes.c_double, "double"),
|
||||
}
|
||||
@@ -149,12 +150,12 @@ def copy_type_sweep(state: nvbench.State):
|
||||
nbytes = 256 * 1024 * 1024
|
||||
num_values = nbytes // ctypes.sizeof(value_ctype(0))
|
||||
|
||||
state.addElementCount(num_values)
|
||||
state.addGlobalMemoryReads(nbytes)
|
||||
state.addGlobalMemoryWrites(nbytes)
|
||||
state.add_element_count(num_values)
|
||||
state.add_global_memory_reads(nbytes)
|
||||
state.add_global_memory_writes(nbytes)
|
||||
|
||||
dev_id = state.getDevice()
|
||||
alloc_s = as_core_Stream(state.getStream())
|
||||
dev_id = state.get_device()
|
||||
alloc_s = as_core_Stream(state.get_stream())
|
||||
input_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s)
|
||||
output_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s)
|
||||
|
||||
@@ -162,7 +163,7 @@ def copy_type_sweep(state: nvbench.State):
|
||||
launch_config = core.LaunchConfig(grid=256, block=256, shmem_size=0)
|
||||
|
||||
def launcher(launch: nvbench.Launch):
|
||||
s = as_core_Stream(launch.getStream())
|
||||
s = as_core_Stream(launch.get_stream())
|
||||
core.launch(s, launch_config, krn, input_buf, output_buf, num_values)
|
||||
|
||||
state.exec(launcher)
|
||||
@@ -175,13 +176,15 @@ if __name__ == "__main__":
|
||||
# benchmark with no axes, that uses default value
|
||||
nvbench.register(default_value)
|
||||
# specify axis
|
||||
nvbench.register(single_float64_axis).addFloat64Axis("Duration", [7e-5, 1e-4, 5e-4])
|
||||
nvbench.register(single_float64_axis).add_float64_axis(
|
||||
"Duration", [7e-5, 1e-4, 5e-4]
|
||||
)
|
||||
|
||||
copy1_bench = nvbench.register(copy_sweep_grid_shape)
|
||||
copy1_bench.addInt64Axis("BlockSize", [2**x for x in range(6, 10, 2)])
|
||||
copy1_bench.addInt64Axis("NumBlocks", [2**x for x in range(6, 10, 2)])
|
||||
copy1_bench.add_int64_axis("BlockSize", [2**x for x in range(6, 10, 2)])
|
||||
copy1_bench.add_int64_axis("NumBlocks", [2**x for x in range(6, 10, 2)])
|
||||
|
||||
copy2_bench = nvbench.register(copy_type_sweep)
|
||||
copy2_bench.addInt64Axis("TypeID", range(0, 6))
|
||||
copy2_bench.add_int64_axis("TypeID", range(0, 6))
|
||||
|
||||
nvbench.run_all_benchmarks(sys.argv)
|
||||
|
||||
@@ -35,15 +35,15 @@ def as_cp_ExternalStream(
|
||||
|
||||
def segmented_reduce(state: nvbench.State):
|
||||
"Benchmark segmented_reduce example"
|
||||
n_elems = state.getInt64("numElems")
|
||||
n_cols = state.getInt64("numCols")
|
||||
n_elems = state.get_int64("numElems")
|
||||
n_cols = state.get_int64("numCols")
|
||||
n_rows = n_elems // n_cols
|
||||
|
||||
state.add_summary("numRows", n_rows)
|
||||
state.collectCUPTIMetrics()
|
||||
state.collect_cupti_metrics()
|
||||
|
||||
dev_id = state.getDevice()
|
||||
cp_stream = as_cp_ExternalStream(state.getStream(), dev_id)
|
||||
dev_id = state.get_device()
|
||||
cp_stream = as_cp_ExternalStream(state.get_stream(), dev_id)
|
||||
|
||||
with cp_stream:
|
||||
rng = cp.random.default_rng()
|
||||
@@ -75,20 +75,19 @@ def segmented_reduce(state: nvbench.State):
|
||||
d_input, d_output, start_offsets, end_offsets, add_op, h_init
|
||||
)
|
||||
|
||||
# print(1)
|
||||
cccl_stream = as_cccl_Stream(state.getStream())
|
||||
# print(2, core_stream, core_stream.__cuda_stream__())
|
||||
cccl_stream = as_cccl_Stream(state.get_stream())
|
||||
|
||||
# query size of temporary storage and allocate
|
||||
temp_nbytes = alg(
|
||||
None, d_input, d_output, n_rows, start_offsets, end_offsets, h_init, cccl_stream
|
||||
)
|
||||
h_init = np.zeros(tuple(), dtype=np.int32)
|
||||
# print(3)
|
||||
|
||||
with cp_stream:
|
||||
temp_storage = cp.empty(temp_nbytes, dtype=cp.uint8)
|
||||
|
||||
def launcher(launch: nvbench.Launch):
|
||||
s = as_cccl_Stream(launch.getStream())
|
||||
s = as_cccl_Stream(launch.get_stream())
|
||||
alg(
|
||||
temp_storage,
|
||||
d_input,
|
||||
@@ -105,7 +104,7 @@ def segmented_reduce(state: nvbench.State):
|
||||
|
||||
if __name__ == "__main__":
|
||||
b = nvbench.register(segmented_reduce)
|
||||
b.addInt64Axis("numElems", [2**20, 2**22, 2**24])
|
||||
b.addInt64Axis("numCols", [1024, 2048, 4096, 8192])
|
||||
b.add_int64_axis("numElems", [2**20, 2**22, 2**24])
|
||||
b.add_int64_axis("numCols", [1024, 2048, 4096, 8192])
|
||||
|
||||
nvbench.run_all_benchmarks(sys.argv)
|
||||
|
||||
@@ -13,6 +13,6 @@ def throughput_bench(state: nvbench.State) -> None:
|
||||
|
||||
if __name__ == "__main__":
|
||||
b = nvbench.register(throughput_bench)
|
||||
b.setIsCPUOnly(True)
|
||||
b.set_is_cpu_only(True)
|
||||
|
||||
nvbench.run_all_benchmarks(sys.argv)
|
||||
|
||||
@@ -12,18 +12,18 @@ def as_cp_ExternalStream(
|
||||
|
||||
|
||||
def cupy_extract_by_mask(state: nvbench.State):
|
||||
n_cols = state.getInt64("numCols")
|
||||
n_rows = state.getInt64("numRows")
|
||||
n_cols = state.get_int64("numCols")
|
||||
n_rows = state.get_int64("numRows")
|
||||
|
||||
dev_id = state.getDevice()
|
||||
cp_s = as_cp_ExternalStream(state.getStream(), dev_id)
|
||||
dev_id = state.get_device()
|
||||
cp_s = as_cp_ExternalStream(state.get_stream(), dev_id)
|
||||
|
||||
state.collectCUPTIMetrics()
|
||||
state.addElementCount(n_rows * n_cols, "# Elements")
|
||||
state.addGlobalMemoryReads(
|
||||
state.collect_cupti_metrics()
|
||||
state.add_element_count(n_rows * n_cols, "# Elements")
|
||||
state.add_global_memory_reads(
|
||||
n_rows * n_cols * (cp.dtype(cp.int32).itemsize + cp.dtype("?").itemsize)
|
||||
)
|
||||
state.addGlobalMemoryWrites(n_rows * n_cols * (cp.dtype(cp.int32).itemsize))
|
||||
state.add_global_memory_writes(n_rows * n_cols * (cp.dtype(cp.int32).itemsize))
|
||||
|
||||
with cp_s:
|
||||
X = cp.full((n_cols, n_rows), fill_value=3, dtype=cp.int32)
|
||||
@@ -31,7 +31,7 @@ def cupy_extract_by_mask(state: nvbench.State):
|
||||
_ = X[mask]
|
||||
|
||||
def launcher(launch: nvbench.Launch):
|
||||
with as_cp_ExternalStream(launch.getStream(), dev_id):
|
||||
with as_cp_ExternalStream(launch.get_stream(), dev_id):
|
||||
_ = X[mask]
|
||||
|
||||
state.exec(launcher, sync=True)
|
||||
@@ -39,7 +39,7 @@ def cupy_extract_by_mask(state: nvbench.State):
|
||||
|
||||
if __name__ == "__main__":
|
||||
b = nvbench.register(cupy_extract_by_mask)
|
||||
b.addInt64Axis("numCols", [1024, 2048, 4096, 2 * 4096])
|
||||
b.addInt64Axis("numRows", [1024, 2048, 4096, 2 * 4096])
|
||||
b.add_int64_axis("numCols", [1024, 2048, 4096, 2 * 4096])
|
||||
b.add_int64_axis("numRows", [1024, 2048, 4096, 2 * 4096])
|
||||
|
||||
nvbench.run_all_benchmarks(sys.argv)
|
||||
|
||||
@@ -45,17 +45,17 @@ def synchronizing_bench(state: nvbench.State):
|
||||
n_values = 64 * 1024 * 1024
|
||||
n_bytes = n_values * ctypes.sizeof(ctypes.c_int32(0))
|
||||
|
||||
alloc_s = as_core_Stream(state.getStream())
|
||||
buffer = core.DeviceMemoryResource(state.getDevice()).allocate(n_bytes, alloc_s)
|
||||
alloc_s = as_core_Stream(state.get_stream())
|
||||
buffer = core.DeviceMemoryResource(state.get_device()).allocate(n_bytes, alloc_s)
|
||||
|
||||
state.addElementCount(n_values, "Items")
|
||||
state.addGlobalMemoryWrites(n_bytes, "Size")
|
||||
state.add_element_count(n_values, "Items")
|
||||
state.add_global_memory_writes(n_bytes, "Size")
|
||||
|
||||
krn = make_fill_kernel()
|
||||
launch_config = core.LaunchConfig(grid=256, block=256, shmem_size=0)
|
||||
|
||||
def launcher(launch: nvbench.Launch):
|
||||
s = as_core_Stream(launch.getStream())
|
||||
s = as_core_Stream(launch.get_stream())
|
||||
core.launch(s, launch_config, krn, buffer, 0, n_values)
|
||||
s.sync()
|
||||
|
||||
|
||||
@@ -42,8 +42,8 @@ __global__ void sleep_kernel(double seconds) {
|
||||
|
||||
|
||||
def runtime_skip(state: nvbench.State):
|
||||
duration = state.getFloat64("Duration")
|
||||
kramble = state.getString("Kramble")
|
||||
duration = state.get_float64("Duration")
|
||||
kramble = state.get_string("Kramble")
|
||||
|
||||
# Skip Baz benchmarks with 0.8 ms duration
|
||||
if kramble == "Baz" and duration < 0.8e-3:
|
||||
@@ -59,7 +59,7 @@ def runtime_skip(state: nvbench.State):
|
||||
launch_cfg = core.LaunchConfig(grid=1, block=1, shmem_size=0)
|
||||
|
||||
def launcher(launch: nvbench.Launch):
|
||||
s = as_core_Stream(launch.getStream())
|
||||
s = as_core_Stream(launch.get_stream())
|
||||
core.launch(s, launch_cfg, krn, duration)
|
||||
|
||||
state.exec(launcher)
|
||||
@@ -67,7 +67,7 @@ def runtime_skip(state: nvbench.State):
|
||||
|
||||
if __name__ == "__main__":
|
||||
b = nvbench.register(runtime_skip)
|
||||
b.addFloat64Axis("Duration", [1e-4 + k * 0.25e-3 for k in range(5)])
|
||||
b.addStringAxis("Kramble", ["Foo", "Bar", "Baz"])
|
||||
b.add_float64_axis("Duration", [1e-4 + k * 0.25e-3 for k in range(5)])
|
||||
b.add_string_axis("Kramble", ["Foo", "Bar", "Baz"])
|
||||
|
||||
nvbench.run_all_benchmarks(sys.argv)
|
||||
|
||||
@@ -41,33 +41,27 @@ def make_kernel(items_per_thread: int) -> Callable:
|
||||
|
||||
|
||||
def throughput_bench(state: nvbench.State) -> None:
|
||||
stride = state.getInt64("Stride")
|
||||
ipt = state.getInt64("ItemsPerThread")
|
||||
stride = state.get_int64("Stride")
|
||||
ipt = state.get_int64("ItemsPerThread")
|
||||
|
||||
nbytes = 128 * 1024 * 1024
|
||||
elements = nbytes // np.dtype(np.int32).itemsize
|
||||
|
||||
alloc_stream = as_cuda_Stream(state.getStream())
|
||||
alloc_stream = as_cuda_Stream(state.get_stream())
|
||||
inp_arr = cuda.device_array(elements, dtype=np.int32, stream=alloc_stream)
|
||||
out_arr = cuda.device_array(elements * ipt, dtype=np.int32, stream=alloc_stream)
|
||||
|
||||
state.addElementCount(elements, column_name="Elements")
|
||||
state.addGlobalMemoryReads(inp_arr.nbytes, column_name="Datasize")
|
||||
state.addGlobalMemoryWrites(inp_arr.nbytes)
|
||||
state.add_element_count(elements, column_name="Elements")
|
||||
state.add_global_memory_reads(inp_arr.nbytes, column_name="Datasize")
|
||||
state.add_global_memory_writes(inp_arr.nbytes)
|
||||
|
||||
threads_per_block = 256
|
||||
blocks_in_grid = (elements + threads_per_block - 1) // threads_per_block
|
||||
|
||||
krn = make_kernel(ipt)
|
||||
|
||||
# warm-up call ensures that kernel is loaded into context
|
||||
# before blocking kernel is launched
|
||||
krn[blocks_in_grid, threads_per_block, alloc_stream, 0](
|
||||
stride, elements, inp_arr, out_arr
|
||||
)
|
||||
|
||||
def launcher(launch: nvbench.Launch):
|
||||
exec_stream = as_cuda_Stream(launch.getStream())
|
||||
exec_stream = as_cuda_Stream(launch.get_stream())
|
||||
krn[blocks_in_grid, threads_per_block, exec_stream, 0](
|
||||
stride, elements, inp_arr, out_arr
|
||||
)
|
||||
@@ -77,7 +71,7 @@ def throughput_bench(state: nvbench.State) -> None:
|
||||
|
||||
if __name__ == "__main__":
|
||||
b = nvbench.register(throughput_bench)
|
||||
b.addInt64Axis("Stride", [1, 2, 4])
|
||||
b.addInt64Axis("ItemsPerThread", [1, 2, 3, 4])
|
||||
b.add_int64_axis("Stride", [1, 2, 4])
|
||||
b.add_int64_axis("ItemsPerThread", [1, 2, 3, 4])
|
||||
|
||||
nvbench.run_all_benchmarks(sys.argv)
|
||||
|
||||
@@ -14,18 +14,18 @@ def kernel(a, b, c):
|
||||
c[tid] = a[tid] + b[tid]
|
||||
|
||||
|
||||
def getNumbaStream(launch):
|
||||
return cuda.external_stream(launch.getStream().addressof())
|
||||
def get_numba_stream(launch):
|
||||
return cuda.external_stream(launch.get_stream().addressof())
|
||||
|
||||
|
||||
def add_two(state):
|
||||
# state.skip("Skipping this benchmark for no reason")
|
||||
N = state.getInt64("elements")
|
||||
N = state.get_int64("elements")
|
||||
a = cuda.to_device(np.random.random(N))
|
||||
c = cuda.device_array_like(a)
|
||||
|
||||
state.addGlobalMemoryReads(a.nbytes)
|
||||
state.addGlobalMemoryWrites(c.nbytes)
|
||||
state.add_global_memory_reads(a.nbytes)
|
||||
state.add_global_memory_writes(c.nbytes)
|
||||
|
||||
nthreads = 256
|
||||
nblocks = (len(a) + nthreads - 1) // nthreads
|
||||
@@ -35,22 +35,22 @@ def add_two(state):
|
||||
cuda.synchronize()
|
||||
|
||||
def kernel_launcher(launch):
|
||||
stream = getNumbaStream(launch)
|
||||
stream = get_numba_stream(launch)
|
||||
kernel[nblocks, nthreads, stream](a, a, c)
|
||||
|
||||
state.exec(kernel_launcher, batched=True, sync=True)
|
||||
|
||||
|
||||
def add_float(state):
|
||||
N = state.getInt64("elements")
|
||||
v = state.getFloat64("v")
|
||||
name = state.getString("name")
|
||||
N = state.get_int64("elements")
|
||||
v = state.get_float64("v")
|
||||
name = state.get_string("name")
|
||||
a = cuda.to_device(np.random.random(N).astype(np.float32))
|
||||
b = cuda.to_device(np.random.random(N).astype(np.float32))
|
||||
c = cuda.device_array_like(a)
|
||||
|
||||
state.addGlobalMemoryReads(a.nbytes + b.nbytes)
|
||||
state.addGlobalMemoryWrites(c.nbytes)
|
||||
state.add_global_memory_reads(a.nbytes + b.nbytes)
|
||||
state.add_global_memory_writes(c.nbytes)
|
||||
|
||||
nthreads = 64
|
||||
nblocks = (len(a) + nthreads - 1) // nthreads
|
||||
@@ -58,26 +58,26 @@ def add_float(state):
|
||||
def kernel_launcher(launch):
|
||||
_ = v
|
||||
_ = name
|
||||
stream = getNumbaStream(launch)
|
||||
stream = get_numba_stream(launch)
|
||||
kernel[nblocks, nthreads, stream](a, b, c)
|
||||
|
||||
state.exec(kernel_launcher, batched=True, sync=True)
|
||||
|
||||
|
||||
def add_three(state):
|
||||
N = state.getInt64("elements")
|
||||
N = state.get_int64("elements")
|
||||
a = cuda.to_device(np.random.random(N).astype(np.float32))
|
||||
b = cuda.to_device(np.random.random(N).astype(np.float32))
|
||||
c = cuda.device_array_like(a)
|
||||
|
||||
state.addGlobalMemoryReads(a.nbytes + b.nbytes)
|
||||
state.addGlobalMemoryWrites(c.nbytes)
|
||||
state.add_global_memory_reads(a.nbytes + b.nbytes)
|
||||
state.add_global_memory_writes(c.nbytes)
|
||||
|
||||
nthreads = 256
|
||||
nblocks = (len(a) + nthreads - 1) // nthreads
|
||||
|
||||
def kernel_launcher(launch):
|
||||
stream = getNumbaStream(launch)
|
||||
stream = get_numba_stream(launch)
|
||||
kernel[nblocks, nthreads, stream](a, b, c)
|
||||
|
||||
state.exec(kernel_launcher, batched=True, sync=True)
|
||||
@@ -86,18 +86,18 @@ def add_three(state):
|
||||
|
||||
def register_benchmarks():
|
||||
(
|
||||
nvbench.register(add_two).addInt64Axis(
|
||||
nvbench.register(add_two).add_int64_axis(
|
||||
"elements", [2**pow2 for pow2 in range(20, 23)]
|
||||
)
|
||||
)
|
||||
(
|
||||
nvbench.register(add_float)
|
||||
.addFloat64Axis("v", [0.1, 0.3])
|
||||
.addStringAxis("name", ["Anne", "Lynda"])
|
||||
.addInt64Axis("elements", [2**pow2 for pow2 in range(20, 23)])
|
||||
.add_float64_axis("v", [0.1, 0.3])
|
||||
.add_string_axis("name", ["Anne", "Lynda"])
|
||||
.add_int64_axis("elements", [2**pow2 for pow2 in range(20, 23)])
|
||||
)
|
||||
(
|
||||
nvbench.register(add_three).addInt64Axis(
|
||||
nvbench.register(add_three).add_int64_axis(
|
||||
"elements", [2**pow2 for pow2 in range(20, 22)]
|
||||
)
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user