mirror of
https://github.com/NVIDIA/cutlass.git
synced 2026-04-20 06:48:59 +00:00
v4.4.1 update (#3079)
This commit is contained in:
@@ -2,6 +2,12 @@
|
||||
|
||||
# CUTLASS 4.x
|
||||
|
||||
## [4.4.1](https://github.com/NVIDIA/cutlass/releases/tag/v4.4.1) (2026-02-27)
|
||||
|
||||
### CuTe DSL
|
||||
* Bug fixing and improvements
|
||||
- Fixed a segfault issue with tvm-ffi on aarch64
|
||||
|
||||
## [4.4.0](https://github.com/NVIDIA/cutlass/releases/tag/v4.4.0) (2026-02-14)
|
||||
|
||||
### CuTe DSL
|
||||
@@ -139,7 +145,7 @@
|
||||
- Fixed order issue in `make_smem_layout_a` in utils/hopper_helpers.py
|
||||
|
||||
### CUTLASS C++
|
||||
* Work around a driver bug which will cause occasionally errors when executing kernels.
|
||||
* Work around a driver TMA descriptor related bug which will cause occasional errors on Blackwell when the tensor's backing memory allocation is less than 128KB and it is not a dense non-overlapping tensor.
|
||||
|
||||
## [4.3.3](https://github.com/NVIDIA/cutlass/releases/tag/v4.3.3) (2025-12-12)
|
||||
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||

|
||||
# Overview
|
||||
|
||||
# CUTLASS 4.4.0
|
||||
# CUTLASS 4.4.1
|
||||
|
||||
_CUTLASS 4.4.0 - Feb 2026_
|
||||
_CUTLASS 4.4.1 - Feb 2026_
|
||||
|
||||
CUTLASS is a collection of abstractions for implementing high-performance matrix-matrix multiplication (GEMM)
|
||||
and related computations at all levels and scales within CUDA. It incorporates strategies for
|
||||
@@ -84,6 +84,7 @@ To get started quickly - please refer :
|
||||
- Fixed `cute.printf` with f-string
|
||||
- Fixed an indexing issue of scalar tensor
|
||||
- Fixed small K reference check error for cta_tile_n = 256 case with overlapping accumulator optimization in [Blackwell SM100 persistent dense blockscaled GEMM with static scheduling](https://github.com/NVIDIA/cutlass/tree/main/examples/python/CuTeDSL/blackwell/dense_blockscaled_gemm_persistent.py).
|
||||
- Fixed a segfault issue with tvm-ffi on aarch64
|
||||
|
||||
* API changes
|
||||
- Deprecate get_num_tmem_alloc_cols from blackwell_helpers.py. Use the one from tmem_allocator.py instead.
|
||||
|
||||
@@ -170,10 +170,10 @@ class CtaNorm:
|
||||
print(f"[DSL INFO] pred = {pred.type}")
|
||||
for i in range(cute.size(tXrX, mode=[1])):
|
||||
if pred[i]:
|
||||
cute.autovec_copy(tXgX[None, i], tXrX[None, i]) # LDG.128
|
||||
cute.autovec_copy(tWgW[None, i], tWrW[None, i]) # LDG.128
|
||||
cute.autovec_copy(tXgX[None, i], tXrX[None, i]) # Global load
|
||||
cute.autovec_copy(tWgW[None, i], tWrW[None, i]) # Global load
|
||||
if cutlass.const_expr(self.norm_type == "layer"):
|
||||
cute.autovec_copy(tBgB[None, i], tBrB[None, i]) # LDG.128
|
||||
cute.autovec_copy(tBgB[None, i], tBrB[None, i]) # Global load
|
||||
if cutlass.const_expr(self.norm_type == "layer"):
|
||||
tYrY = self.apply_layernorm(tXrX, tWrW, tBrB, eps, tidx, pred)
|
||||
elif cutlass.const_expr(self.norm_type == "rms"):
|
||||
@@ -421,4 +421,4 @@ if __name__ == "__main__":
|
||||
warmup_iterations=args.warmup_iterations,
|
||||
iterations=args.iterations,
|
||||
)
|
||||
print("\nPASS")
|
||||
print("\nPASS")
|
||||
|
||||
@@ -212,7 +212,8 @@ struct CollectiveMma<
|
||||
|
||||
static_assert(cute::is_same_v<ElementAccumulator, ElementBlockScale>,
|
||||
"ElementAccumulator and ElementBlockScale should be same datatype");
|
||||
using NumSplitsM = cute::C<get<0>(TileShape_{}) / 128>;
|
||||
// For TileShapeM < 128, NumSplitsM should be 1
|
||||
using NumSplitsM = cute::conditional_t<get<0>(TileShape_{}) < _128{}, _1, cute::C<get<0>(TileShape_{}) / 128>>;
|
||||
static_assert(NumSplitsM{} == 1 || NumSplitsM{} == 2);
|
||||
|
||||
struct SharedStorage {
|
||||
|
||||
@@ -36,7 +36,7 @@
|
||||
|
||||
#define CUTLASS_MAJOR 4
|
||||
#define CUTLASS_MINOR 4
|
||||
#define CUTLASS_PATCH 0
|
||||
#define CUTLASS_PATCH 1
|
||||
|
||||
#ifdef CUTLASS_VERSIONS_GENERATED
|
||||
#include "cutlass/version_extended.h"
|
||||
|
||||
@@ -371,6 +371,64 @@ class MLIRBuilder(MLIRTypeBuilder):
|
||||
self.const_str_table[content] = symbol
|
||||
return symbol
|
||||
|
||||
def get_or_load_global_func_ptr_from_text(
|
||||
self,
|
||||
current_block: ir.Block,
|
||||
function_name: str,
|
||||
) -> ir.Value:
|
||||
"""Get or create a function pointer global in .text section and load it.
|
||||
|
||||
This creates a constant global function pointer in the .text section
|
||||
(for AArch64 ADRP range compatibility) and performs a volatile load
|
||||
to prevent optimization.
|
||||
|
||||
This forces the function pointer to be local to the code, bypassing GOT entry
|
||||
ADRP lookup issues on AArch64 when GOT and .text section are more than 4GB
|
||||
apart which can happen when ASLR is applied.
|
||||
"""
|
||||
# Check if we've already created this global
|
||||
if function_name not in self.const_func_ptr_table:
|
||||
symbol = f"__func_ptr_{function_name}"
|
||||
|
||||
module_body = self.module.body
|
||||
with ir.InsertionPoint(module_body):
|
||||
# 1. Create the global constant
|
||||
# We use 'private' linkage so it doesn't conflict across modules
|
||||
global_ptr = llvm.GlobalOp(
|
||||
self.ptr_type,
|
||||
symbol,
|
||||
ir.Attribute.parse("#llvm.linkage<private>"),
|
||||
# Initialization via block below
|
||||
)
|
||||
|
||||
# 2. Set the necessary attributes for JIT safety and AArch64 range
|
||||
# We use 'constant' to mark it as immutable
|
||||
# We use 'section = ".text"' to force it into the code block
|
||||
global_ptr.attributes["constant"] = ir.UnitAttr.get()
|
||||
global_ptr.attributes["section"] = ir.StringAttr.get(".text")
|
||||
|
||||
# 3. Add a constructor block to the GlobalOp to initialize it
|
||||
# with the address of the target function
|
||||
initializer_block = global_ptr.initializer.blocks.append()
|
||||
with ir.InsertionPoint(initializer_block):
|
||||
# Get the address of the external function
|
||||
func_addr = llvm.AddressOfOp(self.ptr_type, function_name).res
|
||||
# Return the address as the initial value of the global
|
||||
llvm.return_(arg=func_addr)
|
||||
|
||||
self.const_func_ptr_table[function_name] = symbol
|
||||
else:
|
||||
symbol = self.const_func_ptr_table[function_name]
|
||||
|
||||
# Load it with volatile semantics in the current block
|
||||
with ir.InsertionPoint(current_block):
|
||||
symbol_addr = self.address_of(symbol, self.ptr_type)
|
||||
# Perform a volatile load to prevent optimization
|
||||
load_op = llvm.load(self.ptr_type, symbol_addr)
|
||||
# Set volatile attribute to prevent optimization
|
||||
load_op.owner.attributes["volatile_"] = ir.UnitAttr.get()
|
||||
return load_op
|
||||
|
||||
|
||||
# function
|
||||
def function(
|
||||
|
||||
@@ -129,13 +129,16 @@ class TVMFFICuteCallProvider(DynamicParamPackCallProvider):
|
||||
cuda_global_state_ptr = self.address_of(
|
||||
self.cuda_global_state_symbol, self.ptr_type
|
||||
)
|
||||
cuda_init_ptr = self.address_of("cuda_init", self.ptr_type)
|
||||
cuda_load_to_device_ptr = self.address_of(
|
||||
"cuda_load_to_device", self.ptr_type
|
||||
)
|
||||
set_error_ptr = self.address_of(
|
||||
"TVMFFIErrorSetRaisedFromCStr", self.ptr_type
|
||||
)
|
||||
|
||||
cuda_init_ptr = context.builder.get_or_load_global_func_ptr_from_text(
|
||||
current_block, "cuda_init"
|
||||
)
|
||||
cuda_load_to_device_ptr = context.builder.get_or_load_global_func_ptr_from_text(
|
||||
current_block, "cuda_load_to_device"
|
||||
)
|
||||
set_error_ptr = context.builder.get_or_load_global_func_ptr_from_text(
|
||||
current_block, "TVMFFIErrorSetRaisedFromCStr"
|
||||
)
|
||||
|
||||
with ir.InsertionPoint(current_block):
|
||||
# Call the callback function with the loaded ptr value
|
||||
@@ -530,7 +533,7 @@ class TVMFFIJitCompiledFunction(tvm_ffi.Function, TVMFFIJitCompiledFunctionBase)
|
||||
"""TVM FFI Function that directly subclasses the tvm_ffi.Function for pos only arguments."""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
TVMFFIJitCompiledFunctionBase.__init__(self, *args, **kwargs)
|
||||
# initialize the tvm_ffi.Function from the current execution engine
|
||||
if self.__chandle__() != 0:
|
||||
raise DSLRuntimeError("TVM FFI function is already initialized")
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
# Use `pip install -r requirements-cu13.txt` with the present file to install a
|
||||
# wheel consistent with the present state of the github repository
|
||||
nvidia-cutlass-dsl[cu13]==4.4.0
|
||||
nvidia-cutlass-dsl[cu13]==4.4.1
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
# Use `pip install -r requirements.txt` with the present file to install a
|
||||
# wheel consistent with the present state of the github repository
|
||||
nvidia-cutlass-dsl==4.4.0
|
||||
nvidia-cutlass-dsl==4.4.1
|
||||
|
||||
@@ -133,7 +133,7 @@ def get_option_registry():
|
||||
this._option_registry = OptionRegistry(device_cc())
|
||||
return this._option_registry
|
||||
|
||||
this.__version__ = '4.4.0'
|
||||
this.__version__ = '4.4.1'
|
||||
|
||||
from cutlass_cppgen.backend import create_memory_pool
|
||||
from cutlass_cppgen.emit.pytorch import pytorch
|
||||
|
||||
@@ -51,7 +51,7 @@ setup_pycute.perform_setup()
|
||||
|
||||
setup(
|
||||
name='cutlass_cppgen',
|
||||
version='4.4.0',
|
||||
version='4.4.1',
|
||||
description='CUTLASS Pythonic Interface',
|
||||
package_dir={'': '.'},
|
||||
packages=[
|
||||
|
||||
@@ -36,7 +36,7 @@ from setuptools import setup
|
||||
def perform_setup():
|
||||
setup(
|
||||
name='cutlass_library',
|
||||
version='4.4.0',
|
||||
version='4.4.1',
|
||||
description='CUTLASS library generation scripts',
|
||||
packages=['cutlass_library']
|
||||
)
|
||||
|
||||
@@ -36,7 +36,7 @@ from setuptools import setup
|
||||
def perform_setup():
|
||||
setup(
|
||||
name='pycute',
|
||||
version='4.4.0',
|
||||
version='4.4.1',
|
||||
description='Python implementation of CuTe',
|
||||
packages=['pycute'],
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user