mirror of
https://github.com/NVIDIA/cutlass.git
synced 2026-04-19 22:38:56 +00:00
v4.3.4 update v2. (#2898)
This commit is contained in:
@@ -3,6 +3,8 @@
|
||||
# CUTLASS 4.x
|
||||
|
||||
## [4.3.4](https://github.com/NVIDIA/cutlass/releases/tag/v4.3.4) (2025-12-22)
|
||||
|
||||
### CuTe DSL
|
||||
* New features
|
||||
- Added PDL support along with example [Kernel launch with Programmatic Dependent Launch](https://github.com/NVIDIA/cutlass/tree/main/examples/python/CuTeDSL/blackwell/programmatic_dependent_launch.py)
|
||||
|
||||
@@ -11,7 +13,12 @@
|
||||
- Enhancement for tvm-ffi AoT case for earlier module unload
|
||||
- Fixed order issue in `make_smem_layout_a` in utils/hopper_helpers.py
|
||||
|
||||
### CUTLASS C++
|
||||
* Work around a driver bug which can occasionally cause errors when executing kernels.
|
||||
|
||||
## [4.3.3](https://github.com/NVIDIA/cutlass/releases/tag/v4.3.3) (2025-12-12)
|
||||
|
||||
### CuTe DSL
|
||||
* New features
|
||||
- Supported namedtuple and kwargs for JIT function arguments in tvm-ffi
|
||||
- Supported variadic tuples for JIT function arguments in tvm-ffi
|
||||
@@ -21,6 +28,8 @@
|
||||
- Clearer error message for the case of runtime error cudaErrorInsufficientDriver
|
||||
|
||||
## [4.3.2](https://github.com/NVIDIA/cutlass/releases/tag/v4.3.2) (2025-12-05)
|
||||
|
||||
### CuTe DSL
|
||||
* New features
|
||||
- New env var `CUTE_DSL_CACHE_DIR` to specify the path for dumping caches
|
||||
|
||||
|
||||
@@ -143,6 +143,7 @@ To get started quickly - please refer :
|
||||
- Fix a few bugs in distributed gemm API and examples.
|
||||
- Fix handling negative zero in sparse compressor.
|
||||
- Add missing `wait_on_dependent_grids` for PDL use case.
|
||||
- Work around a driver bug which can occasionally cause errors when executing kernels.
|
||||
* Fix some profiler issues:
|
||||
- Add some missing reference kernels.
|
||||
- Support VoidC reference kernels.
|
||||
|
||||
@@ -471,6 +471,17 @@ make_im2col_tma_copy_desc(
|
||||
tma_l2Promotion,
|
||||
tma_oob_fill);
|
||||
|
||||
int driver_version = 0;
|
||||
CUresult driver_version_result = cuDriverGetVersion(&driver_version);
|
||||
assert(driver_version_result == CUDA_SUCCESS);
|
||||
if (driver_version <= 13010) {
|
||||
if (cute::bits_to_bytes(
|
||||
cute::cosize(tensor_cwhdn.layout()) *
|
||||
cute::sizeof_bits<typename EngineA::value_type>::value) < 131072) {
|
||||
reinterpret_cast<uint64_t*>(&tma_desc)[1] &= ~(1llu << 21);
|
||||
}
|
||||
}
|
||||
|
||||
// The extra asserts help indicate the error's cause.
|
||||
assert(encode_result != CUDA_ERROR_DEINITIALIZED);
|
||||
assert(encode_result != CUDA_ERROR_NOT_INITIALIZED);
|
||||
|
||||
@@ -1051,6 +1051,17 @@ make_tma_copy_desc(Tensor<GEngine,GLayout> const& gtensor, // The origin
|
||||
smem_swizzle,
|
||||
tma_l2Promotion,
|
||||
tma_oobFill);
|
||||
|
||||
int driver_version = 0;
|
||||
CUresult driver_version_result = cuDriverGetVersion(&driver_version);
|
||||
assert(driver_version_result == CUDA_SUCCESS);
|
||||
if (driver_version <= 13010) {
|
||||
if (cute::bits_to_bytes(
|
||||
cute::cosize(gtensor.layout()) *
|
||||
cute::sizeof_bits<typename GEngine::value_type>::value) < 131072) {
|
||||
reinterpret_cast<uint64_t*>(&tma_desc)[1] &= ~(1llu << 21);
|
||||
}
|
||||
}
|
||||
|
||||
if (result != CUDA_SUCCESS) {
|
||||
std::cerr << "TMA Desc Addr: " << &tma_desc
|
||||
|
||||
Reference in New Issue
Block a user