CUTLASS 3.3.0 (#1167)

* Release 3.3.0: adds support for mixed-precision GEMMs on Hopper and Ampere; adds support for < 16B-aligned GEMMs on Hopper; enhancements to EVT; enhancements to the Python interface; enhancements to sub-byte type handling in CuTe; several other bug fixes and performance improvements.
* Minor doc update.
2026-05-12 09:15:56 +00:00 · 2023-11-02 08:09:05 -07:00
parent 922fb5108b
commit c008b4aea8
263 changed files with 16214 additions and 5008 deletions
--- a/test/python/cutlass/interface/conv2d_interface.py
+++ b/test/python/cutlass/interface/conv2d_interface.py
@@ -50,7 +50,7 @@ class Conv2dEquivalence:
    """
    def __init__(self, conv_kind, element_A, element_B, element_C, element_D, element_accumulator,
                 alignment_A, alignment_B, alignment_C):
-        
+
        self.element_A = element_A
        self.element_B = element_B
        self.element_C = element_C
@@ -59,21 +59,21 @@ class Conv2dEquivalence:
        self.alignment_A = alignment_A
        self.alignment_B = alignment_B
        self.alignment_C = alignment_C
-        
+
        self.conv_kind = conv_kind
-        
+
        self.plan = cutlass.op.Conv2d(
            kind=self.conv_kind, element_A=element_A, element_B=element_B, element_C=element_C,
            element_D=element_D, element_accumulator=element_accumulator)
-        
+
        self.op = self.plan.construct(
-            alignment_A=self.alignment_A, alignment_B=self.alignment_B, 
+            alignment_A=self.alignment_A, alignment_B=self.alignment_B,
            alignment_C=self.alignment_C)
-    
+
    def _plans_equal(self, other_plan) -> bool:
        """
        Compares whether two plans are equal
-        
+
        :param other_plan: plan to compare against the default Conv2d
        :type other_plan: cutlass.op.Conv2d

@@ -81,9 +81,9 @@ class Conv2dEquivalence:
        :rtype: bool
        """
        other_op = other_plan.construct(
-            alignment_A=self.alignment_A, alignment_B=self.alignment_B, 
+            alignment_A=self.alignment_A, alignment_B=self.alignment_B,
            alignment_C=self.alignment_C)
-        
+
        return self.op.rt_module.emit() == other_op.rt_module.emit()

    def generic_test(self):
@@ -91,16 +91,16 @@ class Conv2dEquivalence:
        Tests the equivalence of various constructions of the Conv2d interface when using CUTLASS data types
        and layouts for constructing the Conv2d interface
        """
-        if not datatypes.numpy_available:
+        if not datatypes.is_numpy_available():
            return
-        
+
        # Test when specifying all parameters
        plan_other = cutlass.op.Conv2d(
            kind=self.conv_kind,
            element_A=self.element_A, element_B=self.element_B, element_C=self.element_C,
            element_D=self.element_D, element_accumulator=self.element_accumulator)
        assert self._plans_equal(plan_other)
-        
+
        # Test when specifying all parameters but A
        plan_other = cutlass.op.Conv2d(
            kind=self.conv_kind,
@@ -108,7 +108,7 @@ class Conv2dEquivalence:
            element_D=self.element_D, element_accumulator=self.element_accumulator,
            element=self.element_A)
        assert self._plans_equal(plan_other)
-        
+
        # Test when specifying all parameters but A and B as tensors using generic element and output
        plan_other = cutlass.op.Conv2d(
            kind=self.conv_kind,
@@ -116,7 +116,7 @@ class Conv2dEquivalence:
            element_D=self.element_D, element_accumulator=self.element_accumulator,
            element=self.element_A)
        assert self._plans_equal(plan_other)
-        
+
        # Test without explicit accumulator. Only run if the type of C and the accumulator are equal
        if self.element_C == self.element_accumulator:
            plan_other = cutlass.op.Conv2d(
@@ -125,18 +125,18 @@ class Conv2dEquivalence:
                element_D=self.element_D,
                element=self.element_A)
            assert self._plans_equal(plan_other)
-        
+
        # Test with only the generic types. Only rune if the types of A, B, C, and D are the same
        if (self.element_A == self.element_B and self.element_A == self.element_C and self.element_A == self.element_D
            and self.element_A == self.element_accumulator):
            plan_other = cutlass.op.Conv2d(kind=self.conv_kind, element=self.element_A)
            assert self._plans_equal(plan_other)
-    
+
    def numpy_test(self):
        """
        Tests the equivalence of various constructions of the Conv2d interface when using numpy as a frontend
        """
-        if not datatypes.numpy_available:
+        if not datatypes.is_numpy_available():
            return

        import numpy as np
@@ -145,7 +145,7 @@ class Conv2dEquivalence:
        type_C = datatypes.numpy_type(self.element_C)
        type_D = datatypes.numpy_type(self.element_D)
        type_accum = datatypes.numpy_type(self.element_accumulator)
-        
+
        size = (2, 2)
        A = np.zeros(size, dtype=type_A)
        B = np.zeros(size, dtype=type_B)
@@ -153,49 +153,49 @@ class Conv2dEquivalence:
        D = np.zeros(size, dtype=type_D)

        return self.tensor_test(type_A, type_B, type_C, type_D, type_accum, A, B, C, D)
-    
+
    def torch_test(self):
        """
        Tests the equivalence of various constructions of the Conv2d interface when using torch as a frontend
        """
-        if not datatypes.torch_available:
+        if not datatypes.is_torch_available():
            return
-        
+
        import torch
        type_A = datatypes.torch_type(self.element_A)
        type_B = datatypes.torch_type(self.element_B)
        type_C = datatypes.torch_type(self.element_C)
        type_D = datatypes.torch_type(self.element_D)
        type_accum = datatypes.torch_type(self.element_accumulator)
-        
+
        size = (2, 2)
-        
+
        A = torch.empty(size, dtype=type_A)
        B = torch.empty(size, dtype=type_B)
        C = torch.empty(size, dtype=type_C)
        D = torch.empty(size, dtype=type_D)
-        
+
        return self.tensor_test(type_A, type_B, type_C, type_D, type_accum, A, B, C, D)
-    
+
    def tensor_test(self, type_A, type_B, type_C, type_D, type_accum, A, B, C, D):
        # Test when specifying all parameters via tensors
        plan_np = cutlass.op.Conv2d(kind=self.conv_kind, A=A, B=B, C=C, D=D, element_accumulator=type_accum)
        assert self._plans_equal(plan_np)
-        
+
        # Test when specifying all parameters but A as tensors
        plan_np = cutlass.op.Conv2d(kind=self.conv_kind, B=B, C=C, D=D, element_accumulator=type_accum, element_A=type_A)
        assert self._plans_equal(plan_np)
-        
+
        # Test when specifying all parameters but A and B as tensors and using generic element and output
        if type_A == type_B:
            plan_np = cutlass.op.Conv2d(kind=self.conv_kind, C=C, D=D, element_accumulator=type_accum, element=type_A)
            assert self._plans_equal(plan_np)
-        
+
        # Test without explicit accumulator. Only run if the type of C and the accumulator.
        if type_C == type_accum:
            plan_np = cutlass.op.Conv2d(kind=self.conv_kind, A=A, B=B, C=C, D=D)
            assert self._plans_equal(plan_np)
-        
+
        # Test with only the generic types and layouts. Only run if types and layouts of A, B, C, and D are the same.
        if (type_A == type_B and type_A == type_C and type_A == type_D and type_A == type_accum):
            plan_np = cutlass.op.Conv2d(kind=self.conv_kind, element=type_A)
@@ -223,20 +223,20 @@ type2alignment = {
 }

 def add_test(conv_kind, element_A, element_B, element_C, element_D, element_accumulator):
-    
+
    test_name = f"test_conv2d_{conv_kind}_{element_A}_{element_B}_{element_C}_{element_D}_{element_accumulator}"
-    
+
    def run(self):
        conv2d_eq = Conv2dEquivalence(
-            conv_kind=conv_kind, 
+            conv_kind=conv_kind,
            element_A=element_A, element_B=element_B,
            element_C=element_C, element_D=element_D,
-            element_accumulator=element_accumulator, 
+            element_accumulator=element_accumulator,
            alignment_A=type2alignment[element_A], alignment_B=type2alignment[element_B],
            alignment_C=type2alignment[element_C]
        )
        conv2d_eq.test_all()
-    
+
    setattr(ConvEquivalenceTest, test_name, run)

 for conv_kind in ["fprop", "wgrad", "dgrad"]:
@@ -255,25 +255,25 @@ class Conv2dErrorTests(unittest.TestCase):
    """
    Tests various error scenarios that arise with the high-level Gemm interface
    """
-    
+
    def test_alignment(self):
        """
        Tests case in which the alignment specified is unsupported
        """
        plan = cutlass.op.Conv2d(kind="fprop", element=cutlass.DataType.f16)
-        
+
        with ExpectException(True, 'Alignment 3 is not supported for F16. The construction should fail.'):
            op = plan.construct(alignment_A=3, alignment_B=3, alignment_C=3)
-    
+
    def test_invalid_tile_description(self):
        """
        Tests scenarios in which an invalid tile description is provided for a given CC
        """
        plan = cutlass.op.Conv2d(kind="fprop", element=cutlass.DataType.f16)
-        
+
        td = plan.tile_descriptions()[0]
        td.threadblock_shape=[17, 32, 5]
-        
+
        plan.tile_description = td
        with ExpectException(True, 'The threadblock shape is invalid. The compilation should fail.'):
            plan.compile()
--- a/test/python/cutlass/interface/evt_interface.py
+++ b/test/python/cutlass/interface/evt_interface.py
@@ -93,13 +93,16 @@ class EVTErrorTests(unittest.TestCase):
        """
        Test when the epilogue consumes too much shared memory
        """
-        def evt_too_much_shared_memory(accum, C1, C2, C3, C4, C5):
+        def evt_too_much_shared_memory(accum, C1, C2, C3, C4, C5, C6, C7, C8):
            D1 = accum + C1
            D2 = D1 + C2
            D3 = D2 + C3
            D4 = D3 + C4
-            D = D4 + C5
-            return D, D1, D2, D3, D4
+            D5 = D4 + C5
+            D6 = D5 + C6
+            D7 = D6 + C7
+            D = D7 + C8
+            return D, D1, D2, D3, D4, D5, D6, D7
        
        example_tensors = {
            "accum": self.fake_tensor(np.float16, (6, 512, 512)),
@@ -108,10 +111,16 @@ class EVTErrorTests(unittest.TestCase):
            "C3": self.fake_tensor(np.float16, (6, 512, 512)),
            "C4": self.fake_tensor(np.float16, (6, 512, 512)),
            "C5": self.fake_tensor(np.float16, (6, 512, 512)),
+            "C6": self.fake_tensor(np.float16, (6, 512, 512)),
+            "C7": self.fake_tensor(np.float16, (6, 512, 512)),
+            "C8": self.fake_tensor(np.float16, (6, 512, 512)),
            "D1": self.fake_tensor(np.float16, (6, 512, 512)),
            "D2": self.fake_tensor(np.float16, (6, 512, 512)),
            "D3": self.fake_tensor(np.float16, (6, 512, 512)),
            "D4": self.fake_tensor(np.float16, (6, 512, 512)),
+            "D5": self.fake_tensor(np.float16, (6, 512, 512)),
+            "D6": self.fake_tensor(np.float16, (6, 512, 512)),
+            "D7": self.fake_tensor(np.float16, (6, 512, 512)),
            "D": self.fake_tensor(np.float16, (6, 512, 512))
        }
        
--- a/test/python/cutlass/interface/gemm_interface.py
+++ b/test/python/cutlass/interface/gemm_interface.py
@@ -85,7 +85,7 @@ class GemmEquivalence:
        Tests the equivalence of various constructions of the Gemm interface when using CUTLASS data types
        and layouts for constructing the Gemm interface
        """
-        if not datatypes.numpy_available:
+        if not datatypes.is_numpy_available():
            return

        # Test when specifying all parameters
@@ -126,7 +126,7 @@ class GemmEquivalence:
        """
        Tests the equivalence of various constructions of the Gemm interface when using numpy as a frontend
        """
-        if not datatypes.numpy_available:
+        if not datatypes.is_numpy_available():
            return

        import numpy as np