v4.4.1 update (#3079)

This commit is contained in:
Junkai-Wu
2026-02-28 02:59:21 +08:00
committed by GitHub
parent c651d660d2
commit 3bb6e28d3c
13 changed files with 92 additions and 23 deletions

View File

@@ -2,6 +2,12 @@
# CUTLASS 4.x
## [4.4.1](https://github.com/NVIDIA/cutlass/releases/tag/v4.4.1) (2026-02-27)
### CuTe DSL
* Bug fixes and improvements
- Fixed a segfault issue with tvm-ffi on aarch64
## [4.4.0](https://github.com/NVIDIA/cutlass/releases/tag/v4.4.0) (2026-02-14)
### CuTe DSL
@@ -139,7 +145,7 @@
- Fixed order issue in `make_smem_layout_a` in utils/hopper_helpers.py
### CUTLASS C++
* Work around a driver bug which will cause occasional errors when executing kernels.
* Work around a driver TMA descriptor related bug which will cause occasional errors on Blackwell when the tensor's backing memory allocation is less than 128KB and it is not a dense non-overlapping tensor.
## [4.3.3](https://github.com/NVIDIA/cutlass/releases/tag/v4.3.3) (2025-12-12)

View File

@@ -1,9 +1,9 @@
![ALT](./media/images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition")
# Overview
# CUTLASS 4.4.0
# CUTLASS 4.4.1
_CUTLASS 4.4.0 - Feb 2026_
_CUTLASS 4.4.1 - Feb 2026_
CUTLASS is a collection of abstractions for implementing high-performance matrix-matrix multiplication (GEMM)
and related computations at all levels and scales within CUDA. It incorporates strategies for
@@ -84,6 +84,7 @@ To get started quickly - please refer :
- Fixed `cute.printf` with f-string
- Fixed an indexing issue of scalar tensor
- Fixed small K reference check error for cta_tile_n = 256 case with overlapping accumulator optimization in [Blackwell SM100 persistent dense blockscaled GEMM with static scheduling](https://github.com/NVIDIA/cutlass/tree/main/examples/python/CuTeDSL/blackwell/dense_blockscaled_gemm_persistent.py).
- Fixed a segfault issue with tvm-ffi on aarch64
* API changes
- Deprecate get_num_tmem_alloc_cols from blackwell_helpers.py. Use the one from tmem_allocator.py instead.

View File

@@ -170,10 +170,10 @@ class CtaNorm:
print(f"[DSL INFO] pred = {pred.type}")
for i in range(cute.size(tXrX, mode=[1])):
if pred[i]:
cute.autovec_copy(tXgX[None, i], tXrX[None, i]) # LDG.128
cute.autovec_copy(tWgW[None, i], tWrW[None, i]) # LDG.128
cute.autovec_copy(tXgX[None, i], tXrX[None, i]) # Global load
cute.autovec_copy(tWgW[None, i], tWrW[None, i]) # Global load
if cutlass.const_expr(self.norm_type == "layer"):
cute.autovec_copy(tBgB[None, i], tBrB[None, i]) # LDG.128
cute.autovec_copy(tBgB[None, i], tBrB[None, i]) # Global load
if cutlass.const_expr(self.norm_type == "layer"):
tYrY = self.apply_layernorm(tXrX, tWrW, tBrB, eps, tidx, pred)
elif cutlass.const_expr(self.norm_type == "rms"):
@@ -421,4 +421,4 @@ if __name__ == "__main__":
warmup_iterations=args.warmup_iterations,
iterations=args.iterations,
)
print("\nPASS")
print("\nPASS")

View File

@@ -212,7 +212,8 @@ struct CollectiveMma<
static_assert(cute::is_same_v<ElementAccumulator, ElementBlockScale>,
"ElementAccumulator and ElementBlockScale should be same datatype");
using NumSplitsM = cute::C<get<0>(TileShape_{}) / 128>;
// For TileShapeM < 128, NumSplitsM should be 1
using NumSplitsM = cute::conditional_t<get<0>(TileShape_{}) < _128{}, _1, cute::C<get<0>(TileShape_{}) / 128>>;
static_assert(NumSplitsM{} == 1 || NumSplitsM{} == 2);
struct SharedStorage {

View File

@@ -36,7 +36,7 @@
#define CUTLASS_MAJOR 4
#define CUTLASS_MINOR 4
#define CUTLASS_PATCH 0
#define CUTLASS_PATCH 1
#ifdef CUTLASS_VERSIONS_GENERATED
#include "cutlass/version_extended.h"

View File

@@ -371,6 +371,64 @@ class MLIRBuilder(MLIRTypeBuilder):
self.const_str_table[content] = symbol
return symbol
def get_or_load_global_func_ptr_from_text(
    self,
    current_block: ir.Block,
    function_name: str,
) -> ir.Value:
    """Get or create a function pointer global in .text section and load it.

    This creates a constant global function pointer in the .text section
    (for AArch64 ADRP range compatibility) and performs a volatile load
    to prevent optimization.

    This forces the function pointer to be local to the code, bypassing GOT entry
    ADRP lookup issues on AArch64 when GOT and .text section are more than 4GB
    apart, which can happen when ASLR is applied.

    Args:
        current_block: Block in which the volatile load of the pointer is emitted.
        function_name: Symbol name of the external function to take the address of.

    Returns:
        The loaded function-pointer value (result of the volatile llvm.load).
    """
    # Check if we've already created this global; the table caches the global's
    # symbol name so at most one global is emitted per function per module.
    if function_name not in self.const_func_ptr_table:
        symbol = f"__func_ptr_{function_name}"
        module_body = self.module.body
        with ir.InsertionPoint(module_body):
            # 1. Create the global constant.
            # We use 'private' linkage so it doesn't conflict across modules.
            global_ptr = llvm.GlobalOp(
                self.ptr_type,
                symbol,
                ir.Attribute.parse("#llvm.linkage<private>"),
                # Initialization via block below
            )
            # 2. Set the necessary attributes for JIT safety and AArch64 range.
            # We use 'constant' to mark it as immutable.
            # We use 'section = ".text"' to force it into the code block.
            global_ptr.attributes["constant"] = ir.UnitAttr.get()
            global_ptr.attributes["section"] = ir.StringAttr.get(".text")
            # 3. Add a constructor block to the GlobalOp to initialize it
            # with the address of the target function.
            initializer_block = global_ptr.initializer.blocks.append()
            with ir.InsertionPoint(initializer_block):
                # Get the address of the external function.
                func_addr = llvm.AddressOfOp(self.ptr_type, function_name).res
                # Return the address as the initial value of the global.
                llvm.return_(arg=func_addr)
        self.const_func_ptr_table[function_name] = symbol
    else:
        symbol = self.const_func_ptr_table[function_name]
    # Load the pointer with volatile semantics in the current block so the
    # optimizer cannot fold the load back into a GOT-relative ADRP sequence.
    with ir.InsertionPoint(current_block):
        symbol_addr = self.address_of(symbol, self.ptr_type)
        # Perform a volatile load to prevent optimization.
        load_op = llvm.load(self.ptr_type, symbol_addr)
        # Set volatile attribute to prevent optimization.
        load_op.owner.attributes["volatile_"] = ir.UnitAttr.get()
    return load_op
# function
def function(

View File

@@ -129,13 +129,16 @@ class TVMFFICuteCallProvider(DynamicParamPackCallProvider):
cuda_global_state_ptr = self.address_of(
self.cuda_global_state_symbol, self.ptr_type
)
cuda_init_ptr = self.address_of("cuda_init", self.ptr_type)
cuda_load_to_device_ptr = self.address_of(
"cuda_load_to_device", self.ptr_type
)
set_error_ptr = self.address_of(
"TVMFFIErrorSetRaisedFromCStr", self.ptr_type
)
cuda_init_ptr = context.builder.get_or_load_global_func_ptr_from_text(
current_block, "cuda_init"
)
cuda_load_to_device_ptr = context.builder.get_or_load_global_func_ptr_from_text(
current_block, "cuda_load_to_device"
)
set_error_ptr = context.builder.get_or_load_global_func_ptr_from_text(
current_block, "TVMFFIErrorSetRaisedFromCStr"
)
with ir.InsertionPoint(current_block):
# Call the callback function with the loaded ptr value
@@ -530,7 +533,7 @@ class TVMFFIJitCompiledFunction(tvm_ffi.Function, TVMFFIJitCompiledFunctionBase)
"""TVM FFI Function that directly subclasses the tvm_ffi.Function for pos only arguments."""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
TVMFFIJitCompiledFunctionBase.__init__(self, *args, **kwargs)
# initialize the tvm_ffi.Function from the current execution engine
if self.__chandle__() != 0:
raise DSLRuntimeError("TVM FFI function is already initialized")

View File

@@ -1,3 +1,3 @@
# Use `pip install -r requirements-cu13.txt` with the present file to install a
# wheel consistent with the present state of the github repository
nvidia-cutlass-dsl[cu13]==4.4.0
nvidia-cutlass-dsl[cu13]==4.4.1

View File

@@ -1,3 +1,3 @@
# Use `pip install -r requirements.txt` with the present file to install a
# wheel consistent with the present state of the github repository
nvidia-cutlass-dsl==4.4.0
nvidia-cutlass-dsl==4.4.1

View File

@@ -133,7 +133,7 @@ def get_option_registry():
this._option_registry = OptionRegistry(device_cc())
return this._option_registry
this.__version__ = '4.4.0'
this.__version__ = '4.4.1'
from cutlass_cppgen.backend import create_memory_pool
from cutlass_cppgen.emit.pytorch import pytorch

View File

@@ -51,7 +51,7 @@ setup_pycute.perform_setup()
setup(
name='cutlass_cppgen',
version='4.4.0',
version='4.4.1',
description='CUTLASS Pythonic Interface',
package_dir={'': '.'},
packages=[

View File

@@ -36,7 +36,7 @@ from setuptools import setup
def perform_setup():
    """Register the ``cutlass_library`` package with setuptools.

    The flattened diff text carried both the stale ``version='4.4.0'`` line and
    the new ``version='4.4.1'`` line; a duplicate ``version`` keyword argument
    is a SyntaxError in Python, so only the new value is kept here.
    """
    setup(
        name='cutlass_library',
        # Keep in lockstep with CUTLASS_MAJOR/MINOR/PATCH in cutlass/version.h.
        version='4.4.1',
        description='CUTLASS library generation scripts',
        packages=['cutlass_library']
    )

View File

@@ -36,7 +36,7 @@ from setuptools import setup
def perform_setup():
    """Register the ``pycute`` package with setuptools.

    The flattened diff text carried both the stale ``version='4.4.0'`` line and
    the new ``version='4.4.1'`` line; a duplicate ``version`` keyword argument
    is a SyntaxError in Python, so only the new value is kept here.
    """
    setup(
        name='pycute',
        # Keep in lockstep with CUTLASS_MAJOR/MINOR/PATCH in cutlass/version.h.
        version='4.4.1',
        description='Python implementation of CuTe',
        packages=['pycute'],
    )