# Copyright (c) 2025 - 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # 1. Redistributions of source code must retain the above copyright notice, this # list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # 3. Neither the name of the copyright holder nor the names of its # contributors may be used to endorse or promote products derived from # this software without specific prior written permission. # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import argparse import ctypes import functools import math from typing import Optional, Tuple, Union import cuda.bindings.driver as cuda import torch import cutlass import cutlass.cute as cute import cutlass.cute.testing as testing import cutlass.torch as cutlass_torch import cutlass.utils as utils from cutlass import Boolean, Float32, Int32, Int64 from cutlass.cute.runtime import make_ptr # Support both direct execution and module import try: from .reduce import row_reduce except ImportError: from reduce import row_reduce """ RMSNorm: Root Mean Square Layer Normalization for Hopper & Blackwell (SM90+) ==================================================================== A high-performance RMSNorm implementation using CuTe DSL with cluster-based reduction for large hidden dimensions. RMSNorm computes: y = x / sqrt(mean(x²) + eps) * weight Key Features: ------------- 1. CLUSTER SYNCHRONIZATION (SM90+) - Multiple CTAs cooperate to process large N dimensions - Each CTA handles N/cluster_n elements, then reduces across the cluster - Uses mbarrier for efficient cross-CTA synchronization 2. ARCHITECTURE-SPECIFIC TUNING - SM80 (Ampere): Single-CTA execution (cluster_n=1) - SM90 (Hopper): Cluster support enabled for large N - SM100 (Blackwell): Same as SM90 3. VECTORIZED MEMORY ACCESS - 128-bit vectorized loads/stores for optimal memory throughput - TiledCopy abstraction for organized gmem↔smem↔rmem transfers Cluster Size Selection (FP16): ------------------------------ - N <= 16K: cluster_n = 1 (single CTA) - N <= 32K: cluster_n = 2 - N <= 64K: cluster_n = 4 - N <= 128K: cluster_n = 8 - Larger: cluster_n = 16 To run this example: .. code-block:: bash python examples/python/CuTeDSL/blackwell/rmsnorm.py --M 2048 --N 4096 --dtype BFloat16 python examples/python/CuTeDSL/blackwell/rmsnorm.py --M 2048 --N 4096 --dtype BFloat16 --benchmark python examples/python/CuTeDSL/blackwell/rmsnorm.py --M 2048 --N 32768 --dtype BFloat16 --benchmark To collect performance with NCU profiler: .. code-block:: bash ncu python examples/python/CuTeDSL/blackwell/rmsnorm.py --M 2048 --N 4096 --dtype BFloat16 --skip_ref_check """ # ============================================================================= # Architecture Detection # ============================================================================= @functools.lru_cache(maxsize=16) def get_sm_version(device: Optional[Union[int, torch.device, str]] = None) -> int: """Get the SM (compute capability) version of a CUDA device.""" if not torch.cuda.is_available(): return 80 # Default fallback props = torch.cuda.get_device_properties(device) return props.major * 10 + props.minor def supports_cluster() -> bool: """Check if the current device supports cluster operations (SM90+).""" return get_sm_version() >= 90 # ============================================================================= # Predicate Utility # ============================================================================= @cute.jit def predicate_k(tXcX: cute.Tensor, limit: int) -> cute.Tensor: """Create predicate tensor for bounds checking.""" tXpX = cute.make_rmem_tensor( cute.make_layout( (cute.size(tXcX, mode=[0, 1]), cute.size(tXcX, mode=[1]), cute.size(tXcX, mode=[2])), stride=(cute.size(tXcX, mode=[2]), 0, 1), ), Boolean, ) for rest_v in cutlass.range_constexpr(tXpX.shape[0]): for rest_k in cutlass.range_constexpr(tXpX.shape[2]): tXpX[rest_v, 0, rest_k] = cute.elem_less(tXcX[(0, rest_v), 0, rest_k][1], limit) return tXpX # ============================================================================= # RMSNorm Configuration Class # ============================================================================= class RMSNormConfig: """ Configuration for the RMSNorm kernel. This class encapsulates all kernel configuration computed once at initialization, following CuTe-DSL conventions from official CUTLASS examples. """ COPY_BITS = 128 # 128-bit vectorized loads def __init__( self, dtype: type[cutlass.Numeric], N: int, has_weight: bool = True, sm_version: int | None = None, ): self.dtype = dtype self.N = N self.has_weight = has_weight self.sm_version = sm_version if sm_version is not None else get_sm_version() # Vector size for 128-bit loads self.vec_size = self.COPY_BITS // dtype.width # Compute cluster size (SM90+ only) self.cluster_n = self._compute_cluster_n(N, dtype, self.sm_version) # N per CTA for cluster case self.N_per_cta = N // self.cluster_n # Thread configuration using static methods self.threads_per_row = self._compute_threads_per_row(self.N_per_cta) self.num_threads = self._compute_num_threads(self.N_per_cta) # Derived values self.num_vec_blocks = max( 1, (self.N_per_cta // self.vec_size + self.threads_per_row - 1) // self.threads_per_row ) self.rows_per_block = self.num_threads // self.threads_per_row self.cols_per_tile = self.vec_size * self.num_vec_blocks * self.threads_per_row self.warps_per_row = max(self.threads_per_row // 32, 1) @staticmethod def _compute_cluster_n(N: int, dtype: type[cutlass.Numeric], sm_version: int) -> int: """Compute optimal cluster size based on N and architecture.""" if sm_version < 90: return 1 if dtype.width == 16: # FP16/BF16 if N <= 16 * 1024: return 1 elif N <= 32 * 1024: return 2 elif N <= 64 * 1024: return 4 elif N <= 128 * 1024: return 8 else: return 16 else: # FP32 if N <= 32 * 1024: return 1 elif N <= 64 * 1024: return 2 elif N <= 128 * 1024: return 4 elif N <= 256 * 1024: return 8 else: return 16 @staticmethod def _compute_threads_per_row(N_per_cta: int) -> int: """Compute optimal threads per row based on N per CTA.""" if N_per_cta <= 64: return 8 elif N_per_cta <= 128: return 16 elif N_per_cta <= 3072: return 32 elif N_per_cta <= 6144: return 64 elif N_per_cta <= 16384: return 128 else: return 256 @staticmethod def _compute_num_threads(N_per_cta: int) -> int: """Compute total threads per block.""" return 128 if N_per_cta <= 16384 else 256 @staticmethod def _make_tv_layout( threads_per_row: int, rows_per_block: int, vec_size: int, num_vec_blocks: int, ) -> tuple: """Create Thread-Value layout for coalesced vectorized memory access.""" shape = ( (threads_per_row, rows_per_block), (vec_size, num_vec_blocks), ) stride = ( (vec_size * rows_per_block, 1), (rows_per_block, rows_per_block * vec_size * threads_per_row), ) return shape, stride def smem_size_in_bytes(self) -> int: """Calculate shared memory requirement in bytes.""" tile_bytes = self.rows_per_block * self.cols_per_tile * (self.dtype.width // 8) reduction_bytes = self.rows_per_block * self.warps_per_row * self.cluster_n * 4 mbar_bytes = 8 if self.cluster_n > 1 else 0 return tile_bytes + reduction_bytes + mbar_bytes # ============================================================================= # RMSNorm Kernel Class # ============================================================================= class RMSNormKernel: """ RMSNorm kernel with cluster synchronization for large N. Features: - Cluster-based reduction for large N (SM90+) - Multiple CTAs cooperate via mbarrier - Single reduction (sum of squares) with cluster-level aggregation Example: >>> kernel = RMSNormKernel(cutlass.Float16, N=4096) >>> kernel(x_ptr, w_ptr, o_ptr, M, eps, stream) """ def __init__( self, dtype: cutlass.Numeric, N: int, has_weight: bool = True, config: RMSNormConfig | None = None, ): # Use provided config or create new one if config is not None: self.cfg = config else: self.cfg = RMSNormConfig(dtype, N, has_weight) # Expose key attributes for convenience self.dtype = self.cfg.dtype self.N = self.cfg.N self.has_weight = self.cfg.has_weight self.cluster_n = self.cfg.cluster_n @cute.jit def __call__( self, x_ptr: cute.Pointer, w_ptr: cute.Pointer | None, o_ptr: cute.Pointer, M: Int32, eps: Float32, stream: cuda.CUstream, ): """Host function to launch the RMSNorm kernel.""" cfg = self.cfg # Create CuTe tensors from raw pointers mX = cute.make_tensor( x_ptr, cute.make_layout((M, cfg.N), stride=(cfg.N, 1)), ) mO = cute.make_tensor( o_ptr, cute.make_layout((M, cfg.N), stride=(cfg.N, 1)), ) if cutlass.const_expr(cfg.has_weight and w_ptr is not None): mW = cute.make_tensor( w_ptr, cute.make_layout((cfg.N,), stride=(1,)), ) else: mW = None # Create TV layout using static helper tv_shape, tv_stride = RMSNormConfig._make_tv_layout( cfg.threads_per_row, cfg.rows_per_block, cfg.vec_size, cfg.num_vec_blocks, ) tv_layout = cute.make_layout(tv_shape, stride=tv_stride) tiler_mn = (cfg.rows_per_block, cfg.cols_per_tile) self.kernel(mX, mW, mO, eps, tv_layout, tiler_mn).launch( grid=[cute.ceil_div(M, cfg.rows_per_block), cfg.cluster_n, 1], block=[cfg.num_threads, 1, 1], cluster=[1, cfg.cluster_n, 1] if cutlass.const_expr(cfg.cluster_n > 1) else None, smem=cfg.smem_size_in_bytes(), stream=stream, ) @cute.kernel def kernel( self, mX: cute.Tensor, mW: cute.Tensor | None, mO: cute.Tensor, eps: Float32, tv_layout: cute.Layout, tiler_mn: cute.Shape, ): """Device kernel implementing RMSNorm with cluster support.""" cfg = self.cfg tidx, _, _ = cute.arch.thread_idx() bidx, _, _ = cute.arch.block_idx() if cutlass.const_expr(cfg.cluster_n > 1): cluster_y = cute.arch.block_idx()[1] else: cluster_y = cutlass.const_expr(0) M = mX.shape[0] threads_per_row = tv_layout.shape[0][0] warps_per_row = max(threads_per_row // 32, 1) rows_per_block = tiler_mn[0] # ===================================================================== # Allocate shared memory # ===================================================================== smem = utils.SmemAllocator() sX = smem.allocate_tensor( mX.element_type, cute.make_ordered_layout(tiler_mn, order=(1, 0)), byte_alignment=16, ) if cutlass.const_expr(cfg.cluster_n == 1): reduction_buffer = smem.allocate_tensor( Float32, cute.make_layout((rows_per_block, warps_per_row)), byte_alignment=4, ) mbar_ptr = None else: reduction_buffer = smem.allocate_tensor( Float32, cute.make_layout((rows_per_block, (warps_per_row, cfg.cluster_n))), byte_alignment=4, ) mbar_ptr = smem.allocate_array(Int64, num_elems=1) # ===================================================================== # Initialize cluster # ===================================================================== if cutlass.const_expr(cfg.cluster_n > 1): if tidx == 0: cute.arch.mbarrier_init(mbar_ptr, 1) cute.arch.mbarrier_init_fence() cute.arch.cluster_arrive_relaxed() cute.arch.cluster_wait() # ===================================================================== # Create identity tensor and partition # ===================================================================== idX = cute.make_identity_tensor(mX.shape) gX = cute.local_tile(mX, tiler_mn, (bidx, cluster_y)) gO = cute.local_tile(mO, tiler_mn, (bidx, cluster_y)) cX = cute.local_tile(idX, tiler_mn, (bidx, cluster_y)) if cutlass.const_expr(cfg.has_weight and mW is not None): mW_expanded_layout = cute.prepend( mW.layout, cute.make_layout((tiler_mn[0],), stride=(0,)) ) mW_2d = cute.make_tensor(mW.iterator, mW_expanded_layout) gW = cute.local_tile(mW_2d, tiler_mn, (0, cluster_y)) # ===================================================================== # Create TiledCopy operations # ===================================================================== copy_atom_load_async = cute.make_copy_atom( cute.nvgpu.cpasync.CopyG2SOp(), mX.element_type, num_bits_per_copy=RMSNormConfig.COPY_BITS, ) copy_atom_load_W = cute.make_copy_atom( cute.nvgpu.CopyUniversalOp(), mX.element_type, num_bits_per_copy=RMSNormConfig.COPY_BITS, ) copy_atom_store = cute.make_copy_atom( cute.nvgpu.CopyUniversalOp(), mO.element_type, num_bits_per_copy=RMSNormConfig.COPY_BITS, ) tiled_copy_load = cute.make_tiled_copy(copy_atom_load_async, tv_layout, tiler_mn) tiled_copy_W = cute.make_tiled_copy(copy_atom_load_W, tv_layout, tiler_mn) tiled_copy_store = cute.make_tiled_copy(copy_atom_store, tv_layout, tiler_mn) thr_copy_X = tiled_copy_load.get_slice(tidx) thr_copy_W = tiled_copy_W.get_slice(tidx) thr_copy_O = tiled_copy_store.get_slice(tidx) # Partition tensors tXgX = thr_copy_X.partition_S(gX) tXsX = thr_copy_X.partition_D(sX) tXgO = thr_copy_O.partition_D(gO) tXcX = thr_copy_X.partition_S(cX) # Register fragments tXrX = cute.make_fragment_like(tXgX) tXrO = cute.make_fragment_like(tXgO) if cutlass.const_expr(cfg.has_weight and mW is not None): tWgW = thr_copy_W.partition_S(gW) tWrW = cute.make_fragment_like(tWgW) tXrW = thr_copy_X.retile(tWrW) # ===================================================================== # Bounds checking # ===================================================================== tXpX = predicate_k(tXcX, limit=cfg.N) row_coord = tXcX[(0, 0), 0, 0] row_in_bounds = row_coord[0] < M # ===================================================================== # Async copy global → shared # ===================================================================== if row_in_bounds: cute.copy(copy_atom_load_async, tXgX, tXsX, pred=tXpX) cute.arch.cp_async_commit_group() # Load weight while waiting if cutlass.const_expr(cfg.has_weight and mW is not None): tWpW = predicate_k(thr_copy_W.partition_S(cX), limit=cfg.N) cute.copy(copy_atom_load_W, tWgW, tWrW, pred=tWpW) cute.arch.cp_async_wait_group(0) # ===================================================================== # Pass 1: Compute sum of squares with cluster reduction # ===================================================================== cute.autovec_copy(tXsX, tXrX) x = tXrX.load().to(Float32) x_sq = x * x sum_sq = row_reduce( x_sq, cute.ReductionOp.ADD, threads_per_row, reduction_buffer, mbar_ptr, cfg.cluster_n, Float32(0.0), ) # rstd = 1 / sqrt(mean(x²) + eps) mean_sq = sum_sq / cfg.N rstd = cute.math.rsqrt(mean_sq + eps, fastmath=True) # Sync after reduction if cutlass.const_expr(cfg.cluster_n > 1): cute.arch.cluster_arrive_relaxed() cute.arch.cluster_wait() else: cute.arch.barrier() # ===================================================================== # Pass 2: Normalize and output # ===================================================================== cute.autovec_copy(tXsX, tXrX) x = tXrX.load().to(Float32) y = x * rstd # Apply weight if present if cutlass.const_expr(cfg.has_weight and mW is not None): w = tXrW.load().to(Float32) y = y * w # Store to global memory tXrO.store(y.to(cfg.dtype)) if row_in_bounds: cute.copy(copy_atom_store, tXrO, tXgO, pred=tXpX) # ============================================================================= # Kernel Compilation and Caching # ============================================================================= # Mapping from torch dtype to cutlass dtype _torch_to_cutlass_dtype = { torch.float16: cutlass.Float16, torch.bfloat16: cutlass.BFloat16, torch.float32: cutlass.Float32, } # Cache for compiled kernels _compile_cache: dict = {} def get_compiled_kernel( dtype: type[cutlass.Numeric], N: int, has_weight: bool, stream: cuda.CUstream, ): """ Get or compile the RMSNorm kernel for the given configuration. Uses compilation cache to avoid recompiling for same (dtype, N, has_weight) tuples. :param dtype: Data type (Float16, BFloat16, Float32) :type dtype: type[cutlass.Numeric] :param N: Hidden dimension size :type N: int :param has_weight: Whether weight is applied :type has_weight: bool :param stream: CUDA stream :type stream: cuda.CUstream :return: Compiled kernel function """ key = (dtype, N, has_weight) if key not in _compile_cache: kernel_obj = RMSNormKernel(dtype, N, has_weight) # Compile with representative arguments compiled_kernel = cute.compile( kernel_obj, make_ptr(dtype, 16, cute.AddressSpace.gmem, assumed_align=16), # x_ptr make_ptr(dtype, 16, cute.AddressSpace.gmem, assumed_align=16) if has_weight else None, # w_ptr make_ptr(dtype, 16, cute.AddressSpace.gmem, assumed_align=16), # o_ptr Int32(1), # M (dummy) Float32(1e-6), # eps (dummy) stream, ) _compile_cache[key] = compiled_kernel return _compile_cache[key] # ============================================================================= # Tensor Creation Utilities # ============================================================================= def create_tensors( M: int, N: int, dtype: type[cutlass.Numeric], has_weight: bool, ) -> Tuple: """Create input, weight, and output tensors for RMSNorm.""" torch.manual_seed(42) torch_dtype = cutlass_torch.dtype(dtype) x = torch.randn(M, N, device="cuda", dtype=torch_dtype) weight = torch.randn(N, device="cuda", dtype=torch_dtype) if has_weight else None out = torch.empty_like(x) return x, weight, out def rmsnorm_ref( x: torch.Tensor, weight: torch.Tensor | None = None, eps: float = 1e-6, ) -> torch.Tensor: """Reference RMSNorm implementation in PyTorch.""" x_f32 = x.float() rms = torch.sqrt(torch.mean(x_f32**2, dim=-1, keepdim=True) + eps) x_norm = x_f32 / rms if weight is not None: x_norm = x_norm * weight.float() return x_norm.to(x.dtype) # ============================================================================= # Run Function # ============================================================================= def run( M: int, N: int, dtype: type[cutlass.Numeric], has_weight: bool = True, eps: float = 1e-6, tolerance: float = 1e-2, warmup_iterations: int = 2, iterations: int = 100, skip_ref_check: bool = False, benchmark: bool = False, ) -> float: """ Execute RMSNorm and optionally benchmark performance. :param M: Number of rows (batch size * sequence length) :type M: int :param N: Hidden dimension size :type N: int :param dtype: Data type (Float16, BFloat16, Float32) :type dtype: type[cutlass.Numeric] :param has_weight: Whether to apply learnable weight :type has_weight: bool :param eps: Epsilon for numerical stability :type eps: float :param tolerance: Tolerance for correctness check :type tolerance: float :param warmup_iterations: Warmup iterations for benchmarking :type warmup_iterations: int :param iterations: Number of benchmark iterations :type iterations: int :param skip_ref_check: Skip reference correctness check :type skip_ref_check: bool :param benchmark: Enable benchmarking :type benchmark: bool :return: Execution time in microseconds (if benchmark=True, else 0) :rtype: float """ print("Running RMSNorm test with:") print(f" M: {M}, N: {N}") print(f" dtype: {dtype}") print(f" has_weight: {has_weight}") print(f" eps: {eps}") print(f" SM version: {get_sm_version()}") if not torch.cuda.is_available(): raise RuntimeError("CUDA GPU is required to run this example!") # Get CUDA stream torch_stream = torch.cuda.current_stream() stream = cuda.CUstream(torch_stream.cuda_stream) # Create tensors x, weight, out = create_tensors(M, N, dtype, has_weight) # Get configuration info config = RMSNormConfig(dtype, N, has_weight) print(f" cluster_n: {config.cluster_n}") print(f" threads_per_row: {config.threads_per_row}") print(f" rows_per_block: {config.rows_per_block}") # Get compiled kernel compiled_kernel = get_compiled_kernel(dtype, N, has_weight, stream) # Create pointers for kernel call x_ptr = make_ptr(dtype, x.data_ptr()) w_ptr = make_ptr(dtype, weight.data_ptr()) if weight is not None else None out_ptr = make_ptr(dtype, out.data_ptr()) # Run kernel and verify if not skip_ref_check: compiled_kernel(x_ptr, w_ptr, out_ptr, Int32(M), Float32(eps), stream) torch.cuda.synchronize() ref = rmsnorm_ref(x, weight, eps) torch.testing.assert_close(out, ref, atol=tolerance, rtol=tolerance) print("Correctness check passed!") if not benchmark: return 0.0 # Benchmark print(f"\nBenchmarking with {warmup_iterations} warmup, {iterations} iterations...") def generate_tensors(): x, weight, out = create_tensors(M, N, dtype, has_weight) x_ptr = make_ptr(dtype, x.data_ptr()) w_ptr = make_ptr(dtype, weight.data_ptr()) if weight is not None else None out_ptr = make_ptr(dtype, out.data_ptr()) return testing.JitArguments(x_ptr, w_ptr, out_ptr, Int32(M), Float32(eps), stream) exec_time_us = testing.benchmark( compiled_kernel, workspace_generator=generate_tensors, workspace_count=10, warmup_iterations=warmup_iterations, iterations=iterations, stream=stream, ) # Calculate throughput torch_dtype = cutlass_torch.dtype(dtype) bytes_per_elem = torch.tensor([], dtype=torch_dtype).element_size() total_bytes = M * N * bytes_per_elem * 2 # read x, write out if has_weight: total_bytes += N * bytes_per_elem # read weight (amortized across M) throughput_gbps = (total_bytes / (exec_time_us / 1e6)) / 1e9 print(f"Kernel execution time: {exec_time_us:.2f} us") print(f"Memory throughput: {throughput_gbps:.2f} GB/s") return exec_time_us # ============================================================================= # Main Entry Point # ============================================================================= if __name__ == "__main__": parser = argparse.ArgumentParser( description="RMSNorm kernel example for Blackwell (SM100)" ) parser.add_argument("--M", type=int, default=2048, help="Number of rows") parser.add_argument("--N", type=int, default=4096, help="Hidden dimension size") parser.add_argument( "--dtype", type=cutlass.dtype, default=cutlass.BFloat16, help="Data type (Float16, BFloat16, Float32)", ) parser.add_argument( "--has_weight", action="store_true", default=True, help="Apply learnable weight", ) parser.add_argument( "--no_weight", action="store_true", help="Disable learnable weight", ) parser.add_argument( "--eps", type=float, default=1e-6, help="Epsilon for numerical stability", ) parser.add_argument( "--tolerance", type=float, default=1e-2, help="Tolerance for correctness check", ) parser.add_argument( "--warmup_iterations", type=int, default=2, help="Warmup iterations for benchmarking", ) parser.add_argument( "--iterations", type=int, default=100, help="Number of benchmark iterations", ) parser.add_argument( "--skip_ref_check", action="store_true", help="Skip reference correctness check", ) parser.add_argument( "--benchmark", action="store_true", help="Enable benchmarking", ) args = parser.parse_args() # Handle weight flag has_weight = args.has_weight and not args.no_weight run( M=args.M, N=args.N, dtype=args.dtype, has_weight=has_weight, eps=args.eps, tolerance=args.tolerance, warmup_iterations=args.warmup_iterations, iterations=args.iterations, skip_ref_check=args.skip_ref_check, benchmark=args.benchmark, ) print("PASS")