v4.4.1 update (#3079)

This commit is contained in:
Junkai-Wu
2026-02-28 02:59:21 +08:00
committed by GitHub
parent c651d660d2
commit 3bb6e28d3c
13 changed files with 92 additions and 23 deletions

View File

@@ -371,6 +371,64 @@ class MLIRBuilder(MLIRTypeBuilder):
self.const_str_table[content] = symbol
return symbol
def get_or_load_global_func_ptr_from_text(
    self,
    current_block: ir.Block,
    function_name: str,
) -> ir.Value:
    """Return a volatile-loaded pointer to ``function_name`` via a .text-section global.

    A private, constant global holding the target function's address is
    created at most once per function name (cached in
    ``self.const_func_ptr_table``) and pinned into the ``.text`` section so
    AArch64 ADRP-relative addressing can always reach it.  The pointer is
    then loaded with volatile semantics in ``current_block`` so the load is
    not optimized away.  This bypasses GOT-based ADRP lookups, which can
    fail on AArch64 when ASLR places the GOT more than 4GB away from the
    .text section.
    """
    symbol = self.const_func_ptr_table.get(function_name)
    if symbol is None:
        # First request for this function: materialize the global constant.
        symbol = f"__func_ptr_{function_name}"
        with ir.InsertionPoint(self.module.body):
            # Private linkage keeps the symbol from clashing across modules;
            # the value is supplied by the initializer region below.
            func_ptr_global = llvm.GlobalOp(
                self.ptr_type,
                symbol,
                ir.Attribute.parse("#llvm.linkage<private>"),
            )
            # Mark immutable and force placement into the code section so
            # it stays within ADRP range of the calling code.
            func_ptr_global.attributes["constant"] = ir.UnitAttr.get()
            func_ptr_global.attributes["section"] = ir.StringAttr.get(".text")
            # Initializer region: return the external function's address as
            # the global's initial value.
            init_block = func_ptr_global.initializer.blocks.append()
            with ir.InsertionPoint(init_block):
                target_addr = llvm.AddressOfOp(self.ptr_type, function_name).res
                llvm.return_(arg=target_addr)
        self.const_func_ptr_table[function_name] = symbol
    # Emit the load of the cached global inside the caller's block.
    with ir.InsertionPoint(current_block):
        global_addr = self.address_of(symbol, self.ptr_type)
        loaded_ptr = llvm.load(self.ptr_type, global_addr)
        # Volatile so later optimization passes cannot fold or remove it.
        loaded_ptr.owner.attributes["volatile_"] = ir.UnitAttr.get()
    return loaded_ptr
# function
def function(

View File

@@ -129,13 +129,16 @@ class TVMFFICuteCallProvider(DynamicParamPackCallProvider):
cuda_global_state_ptr = self.address_of(
self.cuda_global_state_symbol, self.ptr_type
)
cuda_init_ptr = self.address_of("cuda_init", self.ptr_type)
cuda_load_to_device_ptr = self.address_of(
"cuda_load_to_device", self.ptr_type
)
set_error_ptr = self.address_of(
"TVMFFIErrorSetRaisedFromCStr", self.ptr_type
)
cuda_init_ptr = context.builder.get_or_load_global_func_ptr_from_text(
current_block, "cuda_init"
)
cuda_load_to_device_ptr = context.builder.get_or_load_global_func_ptr_from_text(
current_block, "cuda_load_to_device"
)
set_error_ptr = context.builder.get_or_load_global_func_ptr_from_text(
current_block, "TVMFFIErrorSetRaisedFromCStr"
)
with ir.InsertionPoint(current_block):
# Call the callback function with the loaded ptr value
@@ -530,7 +533,7 @@ class TVMFFIJitCompiledFunction(tvm_ffi.Function, TVMFFIJitCompiledFunctionBase)
"""TVM FFI Function that directly subclasses the tvm_ffi.Function for pos only arguments."""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
TVMFFIJitCompiledFunctionBase.__init__(self, *args, **kwargs)
# initialize the tvm_ffi.Function from the current execution engine
if self.__chandle__() != 0:
raise DSLRuntimeError("TVM FFI function is already initialized")

View File

@@ -1,3 +1,3 @@
# Run `pip install -r requirements-cu13.txt` against this file to install a
# wheel consistent with the current state of the GitHub repository.
nvidia-cutlass-dsl[cu13]==4.4.0
nvidia-cutlass-dsl[cu13]==4.4.1

View File

@@ -1,3 +1,3 @@
# Run `pip install -r requirements.txt` against this file to install a
# wheel consistent with the current state of the GitHub repository.
nvidia-cutlass-dsl==4.4.0
nvidia-cutlass-dsl==4.4.1

View File

@@ -133,7 +133,7 @@ def get_option_registry():
this._option_registry = OptionRegistry(device_cc())
return this._option_registry
this.__version__ = '4.4.0'
this.__version__ = '4.4.1'
from cutlass_cppgen.backend import create_memory_pool
from cutlass_cppgen.emit.pytorch import pytorch

View File

@@ -51,7 +51,7 @@ setup_pycute.perform_setup()
setup(
name='cutlass_cppgen',
version='4.4.0',
version='4.4.1',
description='CUTLASS Pythonic Interface',
package_dir={'': '.'},
packages=[

View File

@@ -36,7 +36,7 @@ from setuptools import setup
def perform_setup():
setup(
name='cutlass_library',
version='4.4.0',
version='4.4.1',
description='CUTLASS library generation scripts',
packages=['cutlass_library']
)

View File

@@ -36,7 +36,7 @@ from setuptools import setup
def perform_setup():
setup(
name='pycute',
version='4.4.0',
version='4.4.1',
description='Python implementation of CuTe',
packages=['pycute'],
)