v4.4.1 update (#3079)

This commit is contained in:
Junkai-Wu
2026-02-28 02:59:21 +08:00
committed by GitHub
parent c651d660d2
commit 3bb6e28d3c
13 changed files with 92 additions and 23 deletions

View File

@@ -371,6 +371,64 @@ class MLIRBuilder(MLIRTypeBuilder):
self.const_str_table[content] = symbol
return symbol
def get_or_load_global_func_ptr_from_text(
    self,
    current_block: ir.Block,
    function_name: str,
) -> ir.Value:
    """Return a volatile-loaded pointer to ``function_name`` via a .text-section global.

    A private, constant global holding the target function's address is
    created at most once per function name (cached in
    ``self.const_func_ptr_table``) and pinned into the ``.text`` section so
    AArch64 ADRP-relative addressing can always reach it.  The pointer is
    then loaded with volatile semantics in ``current_block`` so the load is
    not optimized away.  This bypasses GOT-based ADRP lookups, which can
    fail on AArch64 when ASLR places the GOT more than 4GB away from the
    .text section.
    """
    symbol = self.const_func_ptr_table.get(function_name)
    if symbol is None:
        # First request for this function: materialize the global constant.
        symbol = f"__func_ptr_{function_name}"
        with ir.InsertionPoint(self.module.body):
            # Private linkage keeps the symbol from clashing across modules;
            # the value is supplied by the initializer region below.
            func_ptr_global = llvm.GlobalOp(
                self.ptr_type,
                symbol,
                ir.Attribute.parse("#llvm.linkage<private>"),
            )
            # Mark immutable and force placement into the code section so
            # it stays within ADRP range of the calling code.
            func_ptr_global.attributes["constant"] = ir.UnitAttr.get()
            func_ptr_global.attributes["section"] = ir.StringAttr.get(".text")
            # Initializer region: return the external function's address as
            # the global's initial value.
            init_block = func_ptr_global.initializer.blocks.append()
            with ir.InsertionPoint(init_block):
                target_addr = llvm.AddressOfOp(self.ptr_type, function_name).res
                llvm.return_(arg=target_addr)
        self.const_func_ptr_table[function_name] = symbol
    # Emit the load of the cached global inside the caller's block.
    with ir.InsertionPoint(current_block):
        global_addr = self.address_of(symbol, self.ptr_type)
        loaded_ptr = llvm.load(self.ptr_type, global_addr)
        # Volatile so later optimization passes cannot fold or remove it.
        loaded_ptr.owner.attributes["volatile_"] = ir.UnitAttr.get()
    return loaded_ptr
# function
def function(

View File

@@ -129,13 +129,16 @@ class TVMFFICuteCallProvider(DynamicParamPackCallProvider):
cuda_global_state_ptr = self.address_of(
self.cuda_global_state_symbol, self.ptr_type
)
cuda_init_ptr = self.address_of("cuda_init", self.ptr_type)
cuda_load_to_device_ptr = self.address_of(
"cuda_load_to_device", self.ptr_type
)
set_error_ptr = self.address_of(
"TVMFFIErrorSetRaisedFromCStr", self.ptr_type
)
cuda_init_ptr = context.builder.get_or_load_global_func_ptr_from_text(
current_block, "cuda_init"
)
cuda_load_to_device_ptr = context.builder.get_or_load_global_func_ptr_from_text(
current_block, "cuda_load_to_device"
)
set_error_ptr = context.builder.get_or_load_global_func_ptr_from_text(
current_block, "TVMFFIErrorSetRaisedFromCStr"
)
with ir.InsertionPoint(current_block):
# Call the callback function with the loaded ptr value
@@ -530,7 +533,7 @@ class TVMFFIJitCompiledFunction(tvm_ffi.Function, TVMFFIJitCompiledFunctionBase)
"""TVM FFI Function that directly subclasses the tvm_ffi.Function for pos only arguments."""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
TVMFFIJitCompiledFunctionBase.__init__(self, *args, **kwargs)
# initialize the tvm_ffi.Function from the current execution engine
if self.__chandle__() != 0:
raise DSLRuntimeError("TVM FFI function is already initialized")

View File

@@ -1,3 +1,3 @@
# Run `pip install -r requirements-cu13.txt` against this file to install a
# wheel consistent with the current state of the GitHub repository.
nvidia-cutlass-dsl[cu13]==4.4.0
nvidia-cutlass-dsl[cu13]==4.4.1

View File

@@ -1,3 +1,3 @@
# Run `pip install -r requirements.txt` against this file to install a
# wheel consistent with the current state of the GitHub repository.
nvidia-cutlass-dsl==4.4.0
nvidia-cutlass-dsl==4.4.1

View File

@@ -133,7 +133,7 @@ def get_option_registry():
this._option_registry = OptionRegistry(device_cc())
return this._option_registry
this.__version__ = '4.4.0'
this.__version__ = '4.4.1'
from cutlass_cppgen.backend import create_memory_pool
from cutlass_cppgen.emit.pytorch import pytorch

View File

@@ -51,7 +51,7 @@ setup_pycute.perform_setup()
setup(
name='cutlass_cppgen',
version='4.4.0',
version='4.4.1',
description='CUTLASS Pythonic Interface',
package_dir={'': '.'},
packages=[

View File

@@ -36,7 +36,7 @@ from setuptools import setup
def perform_setup():
setup(
name='cutlass_library',
version='4.4.0',
version='4.4.1',
description='CUTLASS library generation scripts',
packages=['cutlass_library']
)

View File

@@ -36,7 +36,7 @@ from setuptools import setup
def perform_setup():
setup(
name='pycute',
version='4.4.0',
version='4.4.1',
description='Python implementation of CuTe',
packages=['pycute'],
)