v4.3.4 update v2. (#2898)

Junkai-Wu
2025-12-23 11:28:26 +08:00
committed by GitHub
parent 7f5fe3edf1
commit b7ecaa605d
4 changed files with 32 additions and 0 deletions


@@ -3,6 +3,8 @@
# CUTLASS 4.x
## [4.3.4](https://github.com/NVIDIA/cutlass/releases/tag/v4.3.4) (2025-12-22)
### CuTe DSL
* New features
- Added PDL support along with example [Kernel launch with Programmatic Dependent Launch](https://github.com/NVIDIA/cutlass/tree/main/examples/python/CuTeDSL/blackwell/programmatic_dependent_launch.py)
@@ -11,7 +13,12 @@
- Enhanced the tvm-ffi AoT case to allow earlier module unload
- Fixed order issue in `make_smem_layout_a` in utils/hopper_helpers.py
### CUTLASS C++
* Work around a driver bug that occasionally causes errors when executing kernels.
## [4.3.3](https://github.com/NVIDIA/cutlass/releases/tag/v4.3.3) (2025-12-12)
### CuTe DSL
* New features
- Supported namedtuple and kwargs for JIT function arguments in tvm-ffi
- Supported variadic tuples for JIT function arguments in tvm-ffi
@@ -21,6 +28,8 @@
- Clearer error message for the cudaErrorInsufficientDriver runtime error
## [4.3.2](https://github.com/NVIDIA/cutlass/releases/tag/v4.3.2) (2025-12-05)
### CuTe DSL
* New features
- New env var `CUTE_DSL_CACHE_DIR` to specify the path for dumping caches
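A minimal usage sketch for the new variable (the variable name comes from the note above; the cache path and script name are illustrative):

```shell
# Dump CuTe DSL caches into a custom directory (path is illustrative)
export CUTE_DSL_CACHE_DIR="$HOME/.cache/cute_dsl"
mkdir -p "$CUTE_DSL_CACHE_DIR"
# Subsequent CuTe DSL JIT runs pick the directory up from the environment, e.g.:
#   python my_kernel.py
echo "cache dir: $CUTE_DSL_CACHE_DIR"
```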


@@ -143,6 +143,7 @@ To get started quickly - please refer :
- Fix a few bugs in distributed gemm API and examples.
- Fix handling negative zero in sparse compressor.
- Add missing `wait_on_dependent_grids` for PDL use case.
- Work around a driver bug that occasionally causes errors when executing kernels.
* Fix some profiler issues:
- Add some missing reference kernels.
- Support VoidC reference kernels.


@@ -471,6 +471,17 @@ make_im2col_tma_copy_desc(
tma_l2Promotion,
tma_oob_fill);
  // Work around a driver bug: for driver versions <= 13010 (CUDA 13.1), clear
  // bit 21 of the TMA descriptor when the tensor is smaller than 128 KiB.
  int driver_version = 0;
  CUresult driver_version_result = cuDriverGetVersion(&driver_version);
  assert(driver_version_result == CUDA_SUCCESS);
  if (driver_version <= 13010) {
    if (cute::bits_to_bytes(
            cute::cosize(tensor_cwhdn.layout()) *
            cute::sizeof_bits<typename EngineA::value_type>::value) < 131072) {
      reinterpret_cast<uint64_t*>(&tma_desc)[1] &= ~(1llu << 21);
    }
  }
// The extra asserts help indicate the error's cause.
assert(encode_result != CUDA_ERROR_DEINITIALIZED);
assert(encode_result != CUDA_ERROR_NOT_INITIALIZED);


@@ -1051,6 +1051,17 @@ make_tma_copy_desc(Tensor<GEngine,GLayout> const& gtensor, // The origin
smem_swizzle,
tma_l2Promotion,
tma_oobFill);
  // Same workaround as in make_im2col_tma_copy_desc: for driver versions
  // <= 13010 (CUDA 13.1), clear bit 21 of the TMA descriptor when the
  // tensor is smaller than 128 KiB.
  int driver_version = 0;
  CUresult driver_version_result = cuDriverGetVersion(&driver_version);
  assert(driver_version_result == CUDA_SUCCESS);
  if (driver_version <= 13010) {
    if (cute::bits_to_bytes(
            cute::cosize(gtensor.layout()) *
            cute::sizeof_bits<typename GEngine::value_type>::value) < 131072) {
      reinterpret_cast<uint64_t*>(&tma_desc)[1] &= ~(1llu << 21);
    }
  }
if (result != CUDA_SUCCESS) {
std::cerr << "TMA Desc Addr: " << &tma_desc