v4.4.1 update (#3079)

2026-04-20 06:48:59 +00:00 · 2026-02-28 02:59:21 +08:00
parent c651d660d2
commit 3bb6e28d3c
13 changed files with 92 additions and 23 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,12 @@

 # CUTLASS 4.x

+## [4.4.1](https://github.com/NVIDIA/cutlass/releases/tag/v4.4.1) (2026-02-27)
+
+### CuTe DSL
+* Bug fixing and improvements
+  - Fixed a segfault issue with tvm-ffi on aarch64
+
 ## [4.4.0](https://github.com/NVIDIA/cutlass/releases/tag/v4.4.0) (2026-02-14)

 ### CuTe DSL
@@ -139,7 +145,7 @@
  - Fixed order issue in `make_smem_layout_a` in utils/hopper_helpers.py

 ### CUTLASS C++
-* Work around a driver bug which will cause occasionally errors when executing kernels.
+* Work around a driver TMA descriptor related bug which will cause occasional errors on Blackwell when the tensor's backing memory allocation is less than 128KB and it is not a dense non-overlapping tensor.

 ## [4.3.3](https://github.com/NVIDIA/cutlass/releases/tag/v4.3.3) (2025-12-12)

--- a/README.md
+++ b/README.md
@@ -1,9 +1,9 @@
 ![ALT](./media/images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition")
 # Overview

-# CUTLASS 4.4.0
+# CUTLASS 4.4.1

-_CUTLASS 4.4.0 - Feb 2026_
+_CUTLASS 4.4.1 - Feb 2026_

 CUTLASS is a collection of abstractions for implementing high-performance matrix-matrix multiplication (GEMM)
 and related computations at all levels and scales within CUDA. It incorporates strategies for
@@ -84,6 +84,7 @@ To get started quickly - please refer :
  - Fixed `cute.printf` with f-string
  - Fixed an indexing issue of scalar tensor
  - Fixed small K reference check error for cta_tile_n = 256 case with overlapping accumulator optimization in [Blackwell SM100 persistent dense blockscaled GEMM with static scheduling](https://github.com/NVIDIA/cutlass/tree/main/examples/python/CuTeDSL/blackwell/dense_blockscaled_gemm_persistent.py).
+  - Fixed a segfault issue with tvm-ffi on aarch64

 * API changes
  - Deprecate get_num_tmem_alloc_cols from blackwell_helpers.py. Use the one from tmem_allocator.py instead.
--- a/examples/python/CuTeDSL/hopper/cta_norm.py
+++ b/examples/python/CuTeDSL/hopper/cta_norm.py
@@ -170,10 +170,10 @@ class CtaNorm:
        print(f"[DSL INFO]   pred = {pred.type}")
        for i in range(cute.size(tXrX, mode=[1])):
            if pred[i]:
-                cute.autovec_copy(tXgX[None, i], tXrX[None, i])  # LDG.128
-                cute.autovec_copy(tWgW[None, i], tWrW[None, i])  # LDG.128
+                cute.autovec_copy(tXgX[None, i], tXrX[None, i])  # Global load
+                cute.autovec_copy(tWgW[None, i], tWrW[None, i])  # Global load
                if cutlass.const_expr(self.norm_type == "layer"):
-                    cute.autovec_copy(tBgB[None, i], tBrB[None, i])  # LDG.128
+                    cute.autovec_copy(tBgB[None, i], tBrB[None, i])  # Global load
        if cutlass.const_expr(self.norm_type == "layer"):
            tYrY = self.apply_layernorm(tXrX, tWrW, tBrB, eps, tidx, pred)
        elif cutlass.const_expr(self.norm_type == "rms"):
@@ -421,4 +421,4 @@ if __name__ == "__main__":
        warmup_iterations=args.warmup_iterations,
        iterations=args.iterations,
    )
-    print("\nPASS")
+    print("\nPASS")
--- a/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
+++ b/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
@@ -212,7 +212,8 @@ struct CollectiveMma<

  static_assert(cute::is_same_v<ElementAccumulator, ElementBlockScale>,
             "ElementAccumulator and ElementBlockScale should be same datatype");
-  using NumSplitsM = cute::C<get<0>(TileShape_{}) / 128>;
+  // For TileShapeM < 128, NumSplitsM should be 1
+  using NumSplitsM = cute::conditional_t<get<0>(TileShape_{}) < _128{}, _1, cute::C<get<0>(TileShape_{}) / 128>>;
  static_assert(NumSplitsM{} == 1 || NumSplitsM{} == 2);

  struct SharedStorage {
--- a/include/cutlass/version.h
+++ b/include/cutlass/version.h
@@ -36,7 +36,7 @@

 #define CUTLASS_MAJOR 4
 #define CUTLASS_MINOR 4
-#define CUTLASS_PATCH 0
+#define CUTLASS_PATCH 1

 #ifdef CUTLASS_VERSIONS_GENERATED
 #include "cutlass/version_extended.h"
--- a/python/CuTeDSL/cutlass/base_dsl/tvm_ffi_builder/mlir_builder.py
+++ b/python/CuTeDSL/cutlass/base_dsl/tvm_ffi_builder/mlir_builder.py
@@ -371,6 +371,64 @@ class MLIRBuilder(MLIRTypeBuilder):
            self.const_str_table[content] = symbol
        return symbol

+    def get_or_load_global_func_ptr_from_text(
+        self,
+        current_block: ir.Block,
+        function_name: str,
+    ) -> ir.Value:
+        """Get or create a function pointer global in .text section and load it.
+
+        This creates a constant global function pointer in the .text section
+        (for AArch64 ADRP range compatibility) and performs a volatile load
+        to prevent optimization.
+
+        This forces the function pointer to be local to the code, bypassing GOT entry
+        ADRP lookup issues on AArch64 when GOT and .text section are more than 4GB
+        apart which can happen when ASLR is applied.
+        """
+        # Check if we've already created this global
+        if function_name not in self.const_func_ptr_table:
+            symbol = f"__func_ptr_{function_name}"
+
+            module_body = self.module.body
+            with ir.InsertionPoint(module_body):
+                # 1. Create the global constant
+                # We use 'private' linkage so it doesn't conflict across modules
+                global_ptr = llvm.GlobalOp(
+                    self.ptr_type,
+                    symbol,
+                    ir.Attribute.parse("#llvm.linkage<private>"),
+                    # Initialization via block below
+                )
+
+                # 2. Set the necessary attributes for JIT safety and AArch64 range
+                # We use 'constant' to mark it as immutable
+                # We use 'section = ".text"' to force it into the code block
+                global_ptr.attributes["constant"] = ir.UnitAttr.get()
+                global_ptr.attributes["section"] = ir.StringAttr.get(".text")
+
+                # 3. Add a constructor block to the GlobalOp to initialize it
+                # with the address of the target function
+                initializer_block = global_ptr.initializer.blocks.append()
+                with ir.InsertionPoint(initializer_block):
+                    # Get the address of the external function
+                    func_addr = llvm.AddressOfOp(self.ptr_type, function_name).res
+                    # Return the address as the initial value of the global
+                    llvm.return_(arg=func_addr)
+
+                self.const_func_ptr_table[function_name] = symbol
+        else:
+            symbol = self.const_func_ptr_table[function_name]
+
+        # Load it with volatile semantics in the current block
+        with ir.InsertionPoint(current_block):
+            symbol_addr = self.address_of(symbol, self.ptr_type)
+            # Perform a volatile load to prevent optimization
+            load_op = llvm.load(self.ptr_type, symbol_addr)
+            # Set volatile attribute to prevent optimization
+            load_op.owner.attributes["volatile_"] = ir.UnitAttr.get()
+            return load_op
+

    # function
    def function(
--- a/python/CuTeDSL/cutlass/cutlass_dsl/tvm_ffi_provider.py
+++ b/python/CuTeDSL/cutlass/cutlass_dsl/tvm_ffi_provider.py
@@ -129,13 +129,16 @@ class TVMFFICuteCallProvider(DynamicParamPackCallProvider):
            cuda_global_state_ptr = self.address_of(
                self.cuda_global_state_symbol, self.ptr_type
            )
-            cuda_init_ptr = self.address_of("cuda_init", self.ptr_type)
-            cuda_load_to_device_ptr = self.address_of(
-                "cuda_load_to_device", self.ptr_type
-            )
-            set_error_ptr = self.address_of(
-                "TVMFFIErrorSetRaisedFromCStr", self.ptr_type
-            )
+
+        cuda_init_ptr = context.builder.get_or_load_global_func_ptr_from_text(
+            current_block, "cuda_init"
+        )
+        cuda_load_to_device_ptr = context.builder.get_or_load_global_func_ptr_from_text(
+            current_block, "cuda_load_to_device"
+        )
+        set_error_ptr = context.builder.get_or_load_global_func_ptr_from_text(
+            current_block, "TVMFFIErrorSetRaisedFromCStr"
+        )

        with ir.InsertionPoint(current_block):
            # Call the callback function with the loaded ptr value
@@ -530,7 +533,7 @@ class TVMFFIJitCompiledFunction(tvm_ffi.Function, TVMFFIJitCompiledFunctionBase)
    """TVM FFI Function that directly subclasses the tvm_ffi.Function for pos only arguments."""

    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
+        TVMFFIJitCompiledFunctionBase.__init__(self, *args, **kwargs)
        # initialize the tvm_ffi.Function from the current execution engine
        if self.__chandle__() != 0:
            raise DSLRuntimeError("TVM FFI function is already initialized")
--- a/python/CuTeDSL/requirements-cu13.txt
+++ b/python/CuTeDSL/requirements-cu13.txt
@@ -1,3 +1,3 @@
 # Use `pip install -r requirements-cu13.txt` with the present file to install a
 # wheel consistent with the present state of the github repository
-nvidia-cutlass-dsl[cu13]==4.4.0
+nvidia-cutlass-dsl[cu13]==4.4.1
--- a/python/CuTeDSL/requirements.txt
+++ b/python/CuTeDSL/requirements.txt
@@ -1,3 +1,3 @@
 # Use `pip install -r requirements.txt` with the present file to install a
 # wheel consistent with the present state of the github repository
-nvidia-cutlass-dsl==4.4.0
+nvidia-cutlass-dsl==4.4.1
--- a/python/cutlass_cppgen/init.py
+++ b/python/cutlass_cppgen/init.py
@@ -133,7 +133,7 @@ def get_option_registry():
        this._option_registry = OptionRegistry(device_cc())
    return this._option_registry

-this.__version__ = '4.4.0'
+this.__version__ = '4.4.1'

 from cutlass_cppgen.backend import create_memory_pool
 from cutlass_cppgen.emit.pytorch import pytorch
--- a/python/setup_cutlass.py
+++ b/python/setup_cutlass.py
@@ -51,7 +51,7 @@ setup_pycute.perform_setup()

 setup(
    name='cutlass_cppgen',
-    version='4.4.0',
+    version='4.4.1',
    description='CUTLASS Pythonic Interface',
    package_dir={'': '.'},
    packages=[
--- a/python/setup_library.py
+++ b/python/setup_library.py
@@ -36,7 +36,7 @@ from setuptools import setup
 def perform_setup():
    setup(
        name='cutlass_library',
-        version='4.4.0',
+        version='4.4.1',
        description='CUTLASS library generation scripts',
        packages=['cutlass_library']
    )
--- a/python/setup_pycute.py
+++ b/python/setup_pycute.py
@@ -36,7 +36,7 @@ from setuptools import setup
 def perform_setup():
    setup(
        name='pycute',
-        version='4.4.0',
+        version='4.4.1',
        description='Python implementation of CuTe',
        packages=['pycute'],
    )