Release v4.0.0 (#2294)

2026-04-20 14:59:01 +00:00 · 2025-05-13 15:55:29 -04:00
parent ad7b2f5e84
commit f115c3f854
299 changed files with 51495 additions and 4413 deletions
--- a/python/cutlass_library/sm90_utils.py
+++ b/python/cutlass_library/sm90_utils.py
@@ -375,6 +375,13 @@ def generate_tile_descriptions_sm90(math_instructions, is_aligned: bool, level:
    mma_multipliers, cluster_sizes = get_mma_multipliers(level), get_cluster_sizes(level, is_aligned)
    for math_inst, mma_mul, cluster_size in product(math_instructions, mma_multipliers, cluster_sizes):

+        # The generator can stamp out duplicate kernels because it doesn't explicitly set the
+        # instruction shape for SM90 kernels, and the 3.X collective API doesn't directly expose
+        # it when using the auto kernel schedule, so use a copy with a zeroed instruction shape.
+
+        math_inst_stub = copy.deepcopy(math_inst)
+        math_inst_stub.instruction_shape = [0, 0, 0]
+
        tile_desc = TileDescription(
            threadblock_shape=[
                math_inst.instruction_shape[0] * mma_mul[0],
@@ -383,7 +390,7 @@ def generate_tile_descriptions_sm90(math_instructions, is_aligned: bool, level:
            ],
            stages=0,
            warp_count=[4, 1, 1],
-            math_instruction=math_inst,
+            math_instruction=math_inst_stub,
            min_compute=90,
            max_compute=90,
            cluster_shape=cluster_size)
@@ -551,6 +558,7 @@ def get_valid_schedules(tile_description, cuda_version, is_aligned, data_types,
    b_type_size = DataTypeSize[data_types["b_type"]]
    if a_type_size != b_type_size and CudaToolkitVersionSatisfies(cuda_version, 12, 1):
        schedules = []
+        stream_k_schedules = []
        epilogue_schedule = EpilogueScheduleType.TmaWarpSpecialized
        if a_type_size > b_type_size:
            epilogue_schedule = EpilogueScheduleType.EpilogueTransposed
@@ -579,7 +587,11 @@ def get_valid_schedules(tile_description, cuda_version, is_aligned, data_types,
                    KernelScheduleType.TmaWarpSpecializedCooperative,
                    epilogue_schedule
                ])
-        return schedules, []
+                stream_k_schedules.append([
+                    KernelScheduleType.TmaWarpSpecializedCooperative,
+                    epilogue_schedule
+                ])
+        return schedules, stream_k_schedules

    if not is_aligned and not is_blockwise(gemm_kind):
        schedules = [[KernelScheduleType.CpAsyncWarpSpecialized,