mirror of
https://github.com/NVIDIA/cutlass.git
synced 2026-04-19 22:38:56 +00:00
v4.4.1 update (#3079)
This commit is contained in:
@@ -371,6 +371,64 @@ class MLIRBuilder(MLIRTypeBuilder):
|
||||
self.const_str_table[content] = symbol
|
||||
return symbol
|
||||
|
||||
def get_or_load_global_func_ptr_from_text(
|
||||
self,
|
||||
current_block: ir.Block,
|
||||
function_name: str,
|
||||
) -> ir.Value:
|
||||
"""Get or create a function pointer global in .text section and load it.
|
||||
|
||||
This creates a constant global function pointer in the .text section
|
||||
(for AArch64 ADRP range compatibility) and performs a volatile load
|
||||
to prevent optimization.
|
||||
|
||||
This forces the function pointer to be local to the code, bypassing GOT entry
|
||||
ADRP lookup issues on AArch64 when GOT and .text section are more than 4GB
|
||||
apart which can happen when ASLR is applied.
|
||||
"""
|
||||
# Check if we've already created this global
|
||||
if function_name not in self.const_func_ptr_table:
|
||||
symbol = f"__func_ptr_{function_name}"
|
||||
|
||||
module_body = self.module.body
|
||||
with ir.InsertionPoint(module_body):
|
||||
# 1. Create the global constant
|
||||
# We use 'private' linkage so it doesn't conflict across modules
|
||||
global_ptr = llvm.GlobalOp(
|
||||
self.ptr_type,
|
||||
symbol,
|
||||
ir.Attribute.parse("#llvm.linkage<private>"),
|
||||
# Initialization via block below
|
||||
)
|
||||
|
||||
# 2. Set the necessary attributes for JIT safety and AArch64 range
|
||||
# We use 'constant' to mark it as immutable
|
||||
# We use 'section = ".text"' to force it into the code block
|
||||
global_ptr.attributes["constant"] = ir.UnitAttr.get()
|
||||
global_ptr.attributes["section"] = ir.StringAttr.get(".text")
|
||||
|
||||
# 3. Add a constructor block to the GlobalOp to initialize it
|
||||
# with the address of the target function
|
||||
initializer_block = global_ptr.initializer.blocks.append()
|
||||
with ir.InsertionPoint(initializer_block):
|
||||
# Get the address of the external function
|
||||
func_addr = llvm.AddressOfOp(self.ptr_type, function_name).res
|
||||
# Return the address as the initial value of the global
|
||||
llvm.return_(arg=func_addr)
|
||||
|
||||
self.const_func_ptr_table[function_name] = symbol
|
||||
else:
|
||||
symbol = self.const_func_ptr_table[function_name]
|
||||
|
||||
# Load it with volatile semantics in the current block
|
||||
with ir.InsertionPoint(current_block):
|
||||
symbol_addr = self.address_of(symbol, self.ptr_type)
|
||||
# Perform a volatile load to prevent optimization
|
||||
load_op = llvm.load(self.ptr_type, symbol_addr)
|
||||
# Set volatile attribute to prevent optimization
|
||||
load_op.owner.attributes["volatile_"] = ir.UnitAttr.get()
|
||||
return load_op
|
||||
|
||||
|
||||
# function
|
||||
def function(
|
||||
|
||||
@@ -129,13 +129,16 @@ class TVMFFICuteCallProvider(DynamicParamPackCallProvider):
|
||||
cuda_global_state_ptr = self.address_of(
|
||||
self.cuda_global_state_symbol, self.ptr_type
|
||||
)
|
||||
cuda_init_ptr = self.address_of("cuda_init", self.ptr_type)
|
||||
cuda_load_to_device_ptr = self.address_of(
|
||||
"cuda_load_to_device", self.ptr_type
|
||||
)
|
||||
set_error_ptr = self.address_of(
|
||||
"TVMFFIErrorSetRaisedFromCStr", self.ptr_type
|
||||
)
|
||||
|
||||
cuda_init_ptr = context.builder.get_or_load_global_func_ptr_from_text(
|
||||
current_block, "cuda_init"
|
||||
)
|
||||
cuda_load_to_device_ptr = context.builder.get_or_load_global_func_ptr_from_text(
|
||||
current_block, "cuda_load_to_device"
|
||||
)
|
||||
set_error_ptr = context.builder.get_or_load_global_func_ptr_from_text(
|
||||
current_block, "TVMFFIErrorSetRaisedFromCStr"
|
||||
)
|
||||
|
||||
with ir.InsertionPoint(current_block):
|
||||
# Call the callback function with the loaded ptr value
|
||||
@@ -530,7 +533,7 @@ class TVMFFIJitCompiledFunction(tvm_ffi.Function, TVMFFIJitCompiledFunctionBase)
|
||||
"""TVM FFI Function that directly subclasses the tvm_ffi.Function for pos only arguments."""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
TVMFFIJitCompiledFunctionBase.__init__(self, *args, **kwargs)
|
||||
# initialize the tvm_ffi.Function from the current execution engine
|
||||
if self.__chandle__() != 0:
|
||||
raise DSLRuntimeError("TVM FFI function is already initialized")
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
# Use `pip install -r requirements-cu13.txt` with the present file to install a
|
||||
# wheel consistent with the present state of the github repository
|
||||
nvidia-cutlass-dsl[cu13]==4.4.0
|
||||
nvidia-cutlass-dsl[cu13]==4.4.1
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
# Use `pip install -r requirements.txt` with the present file to install a
|
||||
# wheel consistent with the present state of the github repository
|
||||
nvidia-cutlass-dsl==4.4.0
|
||||
nvidia-cutlass-dsl==4.4.1
|
||||
|
||||
@@ -133,7 +133,7 @@ def get_option_registry():
|
||||
this._option_registry = OptionRegistry(device_cc())
|
||||
return this._option_registry
|
||||
|
||||
this.__version__ = '4.4.0'
|
||||
this.__version__ = '4.4.1'
|
||||
|
||||
from cutlass_cppgen.backend import create_memory_pool
|
||||
from cutlass_cppgen.emit.pytorch import pytorch
|
||||
|
||||
@@ -51,7 +51,7 @@ setup_pycute.perform_setup()
|
||||
|
||||
setup(
|
||||
name='cutlass_cppgen',
|
||||
version='4.4.0',
|
||||
version='4.4.1',
|
||||
description='CUTLASS Pythonic Interface',
|
||||
package_dir={'': '.'},
|
||||
packages=[
|
||||
|
||||
@@ -36,7 +36,7 @@ from setuptools import setup
|
||||
def perform_setup():
|
||||
setup(
|
||||
name='cutlass_library',
|
||||
version='4.4.0',
|
||||
version='4.4.1',
|
||||
description='CUTLASS library generation scripts',
|
||||
packages=['cutlass_library']
|
||||
)
|
||||
|
||||
@@ -36,7 +36,7 @@ from setuptools import setup
|
||||
def perform_setup():
|
||||
setup(
|
||||
name='pycute',
|
||||
version='4.4.0',
|
||||
version='4.4.1',
|
||||
description='Python implementation of CuTe',
|
||||
packages=['pycute'],
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user