CUTLASS 3.3.0 (#1167)

* Release 3.3.0: adds support for mixed-precision GEMMs on Hopper and Ampere; adds support for < 16B-aligned GEMMs on Hopper; enhancements to EVT; enhancements to the Python interface; enhancements to sub-byte type handling in CuTe; several other bug fixes and performance improvements. * Minor doc update.
2026-05-12 17:25:45 +00:00 · 2023-11-02 08:09:05 -07:00
parent 922fb5108b
commit c008b4aea8
263 changed files with 16214 additions and 5008 deletions
--- a/python/cutlass/init.py
+++ b/python/cutlass/init.py
@@ -37,14 +37,6 @@ import sys
 import cutlass_library


-def _cutlass_path_from_dir() -> str:
-    cutlass_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../')
-    if not os.path.isdir(cutlass_path):
-        raise Exception(f'Environment variable "CUTLASS_PATH" is not defined, '
-                        f'and default path of {cutlass_path} does not exist.')
-    return cutlass_path
-
-
 def _cuda_install_path_from_nvcc() -> str:
    import subprocess
    # Attempt to detect CUDA_INSTALL_PATH based on location of NVCC
@@ -60,66 +52,41 @@ def _cuda_install_path_from_nvcc() -> str:
    return cuda_install_path


-CUTLASS_PATH = os.getenv("CUTLASS_PATH", _cutlass_path_from_dir())
-CUDA_INSTALL_PATH = os.getenv("CUDA_INSTALL_PATH", _cuda_install_path_from_nvcc())
+CUTLASS_PATH = os.getenv("CUTLASS_PATH", cutlass_library.source_path)
+
+# Alias CUTLASS_PATH as source_path
+source_path = CUTLASS_PATH
+
+_CUDA_INSTALL_PATH = None
+def cuda_install_path():
+    """
+    Helper method for on-demand fetching of the CUDA installation path. This allows
+    the import of CUTLASS to proceed even if NVCC is not available. A non-empty
+    CUDA_INSTALL_PATH environment variable wins; NVCC is only probed when it is unset or empty.
+    """
+    global _CUDA_INSTALL_PATH
+    if _CUDA_INSTALL_PATH is None:
+        _CUDA_INSTALL_PATH = os.getenv("CUDA_INSTALL_PATH") or _cuda_install_path_from_nvcc()
+    return _CUDA_INSTALL_PATH
+
 CACHE_FILE = "compiled_cache.db"

-# Import types/methods from the CUTLASS utility libraries for profiler generation/emission under
-from cutlass_library.library import (
-    ArchitectureNames,
-    ComplexTransform,
-    ComplexTransformTag,
-    ConvKind,
-    ConvKindNames,
-    ConvKindTag,
-    ConvMode,
+from cutlass_library import (
    DataType,
-    DataTypeNames,
-    DataTypeSize,
-    DataTypeTag,
-    EpilogueFunctor,
-    EpilogueScheduleSuffixes,
-    EpilogueScheduleTag,
    EpilogueScheduleType,
-    GemmKind,
-    GemmKindNames,
-    GemmUniversalMode,
-    IteratorAlgorithm,
-    IteratorAlgorithmNames,
-    IteratorAlgorithmTag,
-    LayoutTag,
-    LayoutType,
-    KernelScheduleSuffixes,
-    KernelScheduleTag,
    KernelScheduleType,
-    MathInstruction,
-    MathOperation,
-    MathOperationTag,
+    LayoutType,
    OpcodeClass,
-    OpcodeClassNames,
-    OpcodeClassTag,
-    OperationKind,
-    SharedMemPerCC,
-    ShortComplexLayoutNames,
-    ShortDataTypeNames,
-    ShortLayoutTypeNames,
-    SplitKMode,
-    StrideSupport,
-    StrideSupportNames,
-    StrideSupportTag,
-    SwizzlingFunctor,
-    SwizzlingFunctorTag,
-    TensorDescription,
    TileDescription,
-    TileSchedulerSuffixes,
-    TileSchedulerTag,
    TileSchedulerType,
-    get_complex_from_real,
 )

 this = sys.modules[__name__]
 this.logger = logging.getLogger(__name__)

+# RMM is only supported for Python 3.9+
+this.use_rmm = sys.version_info >= (3, 9)
+
 def set_log_level(level: int):
    """
    Sets the log level
@@ -134,11 +101,20 @@ set_log_level(logging.ERROR)
 from cutlass.library_defaults import OptionRegistry
 from cutlass.backend.utils.device import device_cc

-this.option_registry = OptionRegistry(device_cc())
+this._option_registry = None
+def get_option_registry():
+    """
+    Helper method for on-demand initialization of the options registry. This avoids building
+    the registry when CUTLASS is imported.
+    """
+    if this._option_registry is None:
+        this.logger.info("Initializing option registry")
+        this._option_registry = OptionRegistry(device_cc())
+    return this._option_registry

-this.__version__ = '3.2.1'
+this.__version__ = '3.3.0'

-from cutlass.backend import get_memory_pool
+from cutlass.backend import create_memory_pool
 from cutlass.emit.pytorch import pytorch
 from cutlass.op.gemm import Gemm
 from cutlass.op.conv import Conv2d, Conv2dFprop, Conv2dDgrad, Conv2dWgrad
@@ -146,4 +122,58 @@ from cutlass.op.gemm_grouped import GroupedGemm
 from cutlass.op.op import OperationBase
 from cutlass.backend.evt.ir.tensor import Tensor

-get_memory_pool(init_pool_size=2 ** 30, max_pool_size=2 ** 32)
+
+this.memory_pool = None
+def get_memory_pool():
+    """
+    Helper method for on-demand creation of the memory pool, which avoids allocating it when CUTLASS is
+    imported. Returns the pool, or None if RMM is not in use and no pool has been set.
+    """
+    if this.use_rmm and this.memory_pool is None:
+        this.memory_pool = create_memory_pool(init_pool_size=2 ** 30, max_pool_size=2 ** 32)
+    return this.memory_pool
+
+
+from cuda import cuda
+
+this._context = None
+this._device_id = None
+def initialize_cuda_context():
+    """Initializes CUDA once and records the device ID (CUTLASS_CUDA_DEVICE_ID if set, otherwise 0)."""
+    if this._device_id is not None:
+        return
+
+    if this.use_rmm:
+        # This also covers initializing the CUDA context
+        get_memory_pool()
+    else:
+        # We must manually call cuInit in the absence of RMM, even when the device comes from the environment
+        err, = cuda.cuInit(0)
+        if err != cuda.CUresult.CUDA_SUCCESS:
+            raise Exception(f"cuInit failed with error {err}")
+
+    device_id = os.getenv("CUTLASS_CUDA_DEVICE_ID")
+    if device_id is None:
+        err, device_count = cuda.cuDeviceGetCount()
+        if err != cuda.CUresult.CUDA_SUCCESS:
+            raise Exception(f"cuDeviceGetCount failed with error {err}")
+        if device_count <= 0:
+            raise Exception("No CUDA devices found")
+        device_id = 0
+
+    # Environment values are strings; store an integer ordinal (ValueError if it is not numeric)
+    this._device_id = int(device_id)
+
+    if not this.use_rmm and this._context is None:
+        # We must manually initialize the context in the absence of RMM
+        err, device = cuda.cuDeviceGet(this._device_id)
+        if err != cuda.CUresult.CUDA_SUCCESS:
+            raise Exception(f"cuDeviceGet failed with error {err}")
+        err, this._context = cuda.cuCtxCreate(0, device)
+        if err != cuda.CUresult.CUDA_SUCCESS:
+            raise Exception(f"cuCtxCreate failed with error {err}")
+
+
+def device_id() -> int:
+    initialize_cuda_context()
+    return this._device_id
--- a/python/cutlass/backend/init.py
+++ b/python/cutlass/backend/init.py
@@ -6,17 +6,11 @@ from cutlass.backend.epilogue import *
 from cutlass.backend.frontend import *
 from cutlass.backend.gemm_operation import *
 from cutlass.backend.library import *
-from cutlass.backend.memory_manager import PoolMemoryManager
+from cutlass.backend.memory_manager import PoolMemoryManager, create_memory_pool
 from cutlass.backend.operation import *
 from cutlass.backend.reduction_operation import *
 from cutlass.backend.type_hint import *
 from cutlass.backend.utils import *
 from cutlass.backend.utils.device import device_cc
-from cutlass.backend.utils.software import (
-    CheckPackages,
-    SubstituteTemplate,
-    device_sm_count,
-    get_memory_pool,
-)

 compiler = ArtifactManager()
--- a/python/cutlass/backend/arguments.py
+++ b/python/cutlass/backend/arguments.py
@@ -36,16 +36,10 @@ from typing import Union
 from cuda import cuda, cudart
 import numpy as np

+import cutlass
 from cutlass.backend.frontend import CupyFrontend, NumpyFrontend, TorchFrontend
-from cutlass.backend.utils.software import CheckPackages
-
-torch_available = CheckPackages().check_torch()
-if torch_available:
-    import torch
-
-cupy_available = CheckPackages().check_cupy()
-if cupy_available:
-    import cupy as cp
+from cutlass.backend.memory_manager import DevicePtrWrapper
+from cutlass.utils.datatypes import is_cupy_tensor, is_numpy_tensor, is_torch_tensor


 class ArgumentBase:
@@ -76,7 +70,7 @@ class ArgumentBase:
        self.ptr_A = self.tensor_to_ptr(A, "A")
        self.ptr_B = self.tensor_to_ptr(B, "B")
        self.ptr_C = self.tensor_to_ptr(C, "C")
-        self.ptr_D = self.tensor_to_ptr(D, "D", True)
+        self.ptr_D = self.tensor_to_ptr(D, "D", is_output=True)
        if C is not None:
            if not isinstance(C, cuda.CUdeviceptr):
                self.tensor_c_numel = prod(C.shape)
@@ -88,18 +82,18 @@ class ArgumentBase:
        """
        if tensor is None:
            return cuda.CUdeviceptr(0)
-        if isinstance(tensor, np.ndarray):
+        if is_numpy_tensor(tensor):
            if is_output:
                assert name
            self.buffers[name] = NumpyFrontend.argument(tensor, is_output)
            if is_output:
                self.host_tensors[name] = tensor
            return self.buffers[name].ptr
-        elif torch_available and isinstance(tensor, torch.Tensor):
+        elif is_torch_tensor(tensor):
            return TorchFrontend.argument(tensor)
        elif isinstance(tensor, cuda.CUdeviceptr):
            return tensor
-        elif cupy_available and isinstance(tensor, cp.ndarray):
+        elif is_cupy_tensor(tensor):
            return CupyFrontend.argument(tensor)
        else:
            raise TypeError("Unsupported Frontend. Only support numpy and torch")
@@ -119,3 +113,23 @@ class ArgumentBase:
            )
            if err != cuda.CUresult.CUDA_SUCCESS:
                raise RuntimeError("CUDA Error %s" % str(err))
+
+        self.free()
+
+    def free(self):
+        """
+        Frees device memory referenced by ``self.buffers`` and ``self.workspace_buffer`` when RMM is not in use
+        """
+        # Only DevicePtrWrapper objects are freed. NOTE(review): entries stay in self.buffers, so a second call would cudaFree them again -- confirm
+        if not cutlass.use_rmm:
+            for name, buf in self.buffers.items():
+                if isinstance(buf, DevicePtrWrapper):
+                    err, = cudart.cudaFree(buf.ptr)
+                    if err != cudart.cudaError_t.cudaSuccess:
+                        raise RuntimeError(f"cudaFree failed with error {err}")
+
+            if hasattr(self, "workspace_buffer") and isinstance(self.workspace_buffer, DevicePtrWrapper):
+                err, = cudart.cudaFree(self.workspace_buffer.ptr)
+                if err != cudart.cudaError_t.cudaSuccess:
+                    raise RuntimeError(f"cudaFree failed with error {err}")
+                del self.workspace_buffer
--- a/python/cutlass/backend/c_types.py
+++ b/python/cutlass/backend/c_types.py
@@ -32,7 +32,7 @@

 import ctypes

-from cutlass import (
+from cutlass_library import (
    DataType,
    KernelScheduleType
 )
@@ -125,7 +125,7 @@ def get_mainloop_arguments_3x(
    Returns the ctypes structure to be used for the 3.x kernel's mainloop parameters.

    :param kernel_schedule: type of kernel schedule to be used in the mainloop
-    :type kerel_schedule: cutlass.KernelScheduleType
+    :type kernel_schedule: cutlass_library.KernelScheduleType
    :param element_A: data type of operand A
    :param element_B: data type of operand B
    :param alignment_A: alignment of operand A
@@ -166,25 +166,10 @@ def get_mainloop_arguments_3x(
                args.ptr_A, args.stride_A, args.ptr_B, args.stride_B,
            )

-    tma_alignment_bytes = 16
-    is_tma_aligned_A = ((DataTypeSizeBytes[element_A] * alignment_A) % tma_alignment_bytes) == 0
-    is_tma_aligned_B = ((DataTypeSizeBytes[element_B] * alignment_B) % tma_alignment_bytes) == 0
-    is_tma_aligned = is_tma_aligned_A and is_tma_aligned_B
-
-    if kernel_schedule == KernelScheduleType.Multistage:
-        return _MainloopArgumentsMultistage
-    elif kernel_schedule == KernelScheduleType.ScheduleAuto:
-        if is_tma_aligned:
-            return _MainloopArgumentsTma
-        else:
-            return _MainloopArgumentsMultistage
-    else:
-        if is_tma_aligned:
-            return _MainloopArgumentsTma
-        else:
-            raise Exception(f"Specified a kernel schedule using TMA ({kernel_schedule}), but "
-                            "the provided data types and alignments are not properly aligned for "
-                            "using TMA.")
+    # Currently all 3.x kernels (CpAsync and Tma) have the same argument structure.
+    # Should that become not the case, this is the place to return custom ctypes
+    # structures based on selected kernel schedule.
+    return _MainloopArgumentsTma


 def get_gemm_arguments_3x(mainloop_arguments, epilogue_functor):
--- a/python/cutlass/backend/compiler.py
+++ b/python/cutlass/backend/compiler.py
@@ -38,12 +38,13 @@ import subprocess
 import tempfile

 from cuda import cuda, nvrtc
+from cutlass_library import SubstituteTemplate

-from cutlass import CACHE_FILE, CUDA_INSTALL_PATH, CUTLASS_PATH, logger
+import cutlass
+from cutlass import CACHE_FILE, CUTLASS_PATH, cuda_install_path, logger
 from cutlass.backend.gemm_operation import GemmOperationUniversal
 from cutlass.backend.library import ApiVersion
 from cutlass.backend.utils.device import device_cc
-from cutlass.backend.utils.software import SubstituteTemplate

 IncludeTemplate = r"""#include "${include}"
 """
@@ -316,7 +317,7 @@ class ArtifactManager:
            # compile with nvcc
            cmd_template = "${cuda_install_path}/bin/nvcc ${options} -cubin ${srcfile} -o ${tarfile}"
            values = {
-                "cuda_install_path": CUDA_INSTALL_PATH,
+                "cuda_install_path": cuda_install_path(),
                "options": compilation_options.get_str(),
                "srcfile": temp_cu.name,
                "tarfile": temp_cubin.name,
@@ -336,7 +337,7 @@ class ArtifactManager:
        cmd = SubstituteTemplate(
            cmd_template,
            {
-                "cuda_install_path": CUDA_INSTALL_PATH,
+                "cuda_install_path": cuda_install_path(),
                "options": host_compilation_options.get_str(),
            },
        )
@@ -356,18 +357,15 @@ class ArtifactManager:
        Insert a new compiled device module
        """
        include_paths = [
-            CUDA_INSTALL_PATH + "/include",
+            cuda_install_path() + "/include",
            CUTLASS_PATH + "/include",
            CUTLASS_PATH + "/tools/util/include",
            CUTLASS_PATH + "/python/cutlass/cpp/include",
        ]

-        if device_cc() is not None:
-            arch = device_cc()
-        else:
-            # Find the maximum arch tag among the provided operations and compile for that target.
-            # Since we are compiling to .cubin files, only one architecture may be specified.
-            arch = max([op.arch for op in operations])
+        cutlass.initialize_cuda_context()
+        arch = device_cc()
+
        host_compile_options = CompilationOptions(
            self._nvcc_compile_options, arch, include_paths)
        if compile_options is None:
--- a/python/cutlass/backend/conv2d_operation.py
+++ b/python/cutlass/backend/conv2d_operation.py
@@ -34,9 +34,10 @@ import ctypes
 from typing import Union

 from cuda import cuda
+from cutlass_library import SubstituteTemplate
 import numpy as np

-from cutlass import (
+from cutlass_library import (
    ConvKindNames,
    ConvKindTag,
    DataTypeNames,
@@ -71,13 +72,9 @@ from cutlass.backend.library import (
 )
 from cutlass.backend.memory_manager import device_mem_alloc
 from cutlass.backend.operation import ExecutableOperation, LaunchConfiguration
-from cutlass.backend.utils.datatypes import to_device_ptr
-from cutlass.backend.utils.software import CheckPackages, SubstituteTemplate
+from cutlass.backend.utils.device import to_device_ptr
 from cutlass.shape import GemmCoord

-if CheckPackages().check_torch():
-    import torch
-

 class Conv2dArguments(ArgumentBase):
    """
--- a/python/cutlass/backend/epilogue.py
+++ b/python/cutlass/backend/epilogue.py
@@ -32,14 +32,15 @@

 import ctypes

+from cutlass_library import SubstituteTemplate
 import numpy as np
 from scipy.special import erf

-from cutlass import DataType, DataTypeTag
+from cutlass_library import DataType, DataTypeTag
 from cutlass.backend.c_types import MatrixCoord_
 from cutlass.backend.frontend import NumpyFrontend
 from cutlass.backend.library import ActivationOp, ActivationOpTag
-from cutlass.backend.utils.software import CheckPackages, SubstituteTemplate
+from cutlass.utils.datatypes import is_numpy_tensor, is_torch_available, is_torch_tensor

 dtype2ctype = {
    DataType.f16: ctypes.c_uint16,
@@ -49,8 +50,7 @@ dtype2ctype = {
    DataType.s32: ctypes.c_int32
 }

-torch_available = CheckPackages().check_torch()
-if torch_available:
+if is_torch_available():
    import torch
    import torch.nn.functional as F

@@ -59,11 +59,11 @@ def get_scalar(value):
    """
    Returns a scalar value from a container (e.g., np.ndarray)
    """
-    if isinstance(value, np.ndarray):
+    if is_numpy_tensor(value):
        if value.size != 1:
            raise Exception("Scalars used in epilogue must be of size 1")
        return value.reshape(-1)[0]
-    elif CheckPackages().check_torch() and isinstance(value, torch.Tensor):
+    elif is_torch_tensor(value):
        if value.size != 1:
            raise Exception("Scalars used in epilogue must be of size 1")
        return value.reshape(-1)[0]
@@ -353,9 +353,9 @@ class ActivationFunctor:
 class ActivationMeta(type):
    @classmethod
    def __call__(cls, x, *args):
-        if isinstance(x, np.ndarray):
+        if is_numpy_tensor(x):
            return cls.numpy(x, *args)
-        elif torch_available and isinstance(x, torch.Tensor):
+        elif is_torch_tensor(x):
            return cls.torch(x, *args)
        else:
            raise NotImplementedError("Unsupported tensor type")
--- a/python/cutlass/backend/evt/backend/emitter_base.py
+++ b/python/cutlass/backend/evt/backend/emitter_base.py
@@ -34,7 +34,7 @@
 Base class for Epilogue Visitor Emitter
 """

-from cutlass import DataTypeTag
+from cutlass_library import DataTypeTag
 from cutlass.backend.evt.ir import TopoVisitorNode, DAGIR


--- a/python/cutlass/backend/evt/backend/sm80_nodes.py
+++ b/python/cutlass/backend/evt/backend/sm80_nodes.py
@@ -30,7 +30,7 @@
 #
 #################################################################################################

-from cutlass import DataTypeTag
+from cutlass_library import DataTypeSize, DataTypeTag

 from cutlass.backend.evt.ir import (
    # Load Node
--- a/python/cutlass/backend/evt/backend/sm90_emitter.py
+++ b/python/cutlass/backend/evt/backend/sm90_emitter.py
@@ -34,7 +34,7 @@
 Emitter for Sm90 Epilogue Visitor
 """

-from cutlass import DataTypeTag, EpilogueScheduleTag
+from cutlass_library import DataTypeTag, EpilogueScheduleTag
 from cutlass.backend import GemmOperationUniversal
 from cutlass.backend.evt.backend.emitter_base import FusionCallbacks

--- a/python/cutlass/backend/evt/backend/sm90_nodes.py
+++ b/python/cutlass/backend/evt/backend/sm90_nodes.py
@@ -32,7 +32,7 @@

 from pycute import product

-from cutlass import DataTypeSize, DataTypeTag
+from cutlass_library import DataTypeSize, DataTypeTag
 from cutlass.backend.evt.ir import (
    # Load Node
    AccumulatorImpl,
--- a/python/cutlass/backend/evt/epilogue.py
+++ b/python/cutlass/backend/evt/epilogue.py
@@ -37,12 +37,13 @@ Epilogue Visitor interface for compiling, and running visitor-based epilogue.
 import ctypes

 from cuda import cuda
+from cutlass_library import DataType
 import numpy as np

-from cutlass import DataType
 from cutlass.backend.epilogue import EpilogueFunctorBase
 import cutlass.backend.evt.backend
 from cutlass.backend.frontend import TensorFrontend
+from cutlass.utils.datatypes import is_numpy_tensor


 class EpilogueFunctorVisitor(EpilogueFunctorBase):
@@ -125,7 +126,7 @@ class EpilogueFunctorVisitor(EpilogueFunctorBase):
                # The tensor frontend returns a device buffer for np.ndarray
                # and device ptr for other frontends
                buffer_or_ptr = TensorFrontend.argument(tensor, is_output)
-                if isinstance(tensor, np.ndarray):
+                if is_numpy_tensor(tensor):
                    # Remember the host tensor for later synchronization
                    setattr(self, f"{tensor_name}_buffer", buffer_or_ptr)
                    setattr(self, f"{tensor_name}_host", tensor)
--- a/python/cutlass/backend/evt/frontend/frontend_base.py
+++ b/python/cutlass/backend/evt/frontend/frontend_base.py
@@ -36,7 +36,7 @@ Base class for Python EVT Frontend

 from typing import Union

-from cutlass import DataType
+from cutlass_library import DataType
 from cutlass.backend.evt.ir import (
    ComputeNode,
    DAGIR,
--- a/python/cutlass/backend/evt/frontend/python_ast.py
+++ b/python/cutlass/backend/evt/frontend/python_ast.py
@@ -38,8 +38,9 @@ import ast
 import inspect
 import textwrap

+from cutlass_library import DataType
+
 import cutlass
-from cutlass import DataType
 from cutlass.backend.evt.frontend.frontend_base import EVTFrontendBase
 from cutlass.backend.epilogue import relu
 from cutlass.backend.library import FunctionalOp
--- a/python/cutlass/backend/evt/ir/dag_ir.py
+++ b/python/cutlass/backend/evt/ir/dag_ir.py
@@ -36,7 +36,8 @@ DAG IR used by Python EVT

 import networkx as nx

-from cutlass import DataType
+from cutlass_library import DataType
+
 from cutlass.backend.evt.ir.node import NodeBase
 from cutlass.backend.utils import device_cc

--- a/python/cutlass/backend/evt/ir/layout_nodes.py
+++ b/python/cutlass/backend/evt/ir/layout_nodes.py
@@ -38,10 +38,10 @@ The layout Nodes change the layout of intermediate nodes in epilogue visitor gra

 from copy import deepcopy

+from cutlass_library import LayoutType
 from pycute import product, flatten

 import cutlass
-from cutlass import LayoutType
 from cutlass.backend.evt.ir.layout_algorithm import _list_to_tuple, _tuple_to_list
 from cutlass.backend.evt.ir.node import NodeBase
 from cutlass.backend.evt.ir.tensor import Tensor
--- a/python/cutlass/backend/evt/ir/node.py
+++ b/python/cutlass/backend/evt/ir/node.py
@@ -37,7 +37,8 @@ Base & visitor classes of DAGIR Nodes
 import ctypes
 from re import sub

-from cutlass import LayoutType
+from cutlass_library import LayoutType
+
 from cutlass.backend.evt.ir.layout_algorithm import _list_to_tuple, _reverse_tuple
 from cutlass.backend.evt.ir.tensor import Tensor

--- a/python/cutlass/backend/evt/ir/store_nodes.py
+++ b/python/cutlass/backend/evt/ir/store_nodes.py
@@ -36,7 +36,8 @@ Store node and implementations

 import ctypes

-from cutlass import DataType
+from cutlass_library import DataType
+
 from cutlass.backend.c_types import tuple_factory
 from cutlass.backend.epilogue import dtype2ctype, to_ctype_value
 from cutlass.backend.evt.ir.node import NodeBase, ImplBase, NoOpImpl
--- a/python/cutlass/backend/evt/ir/tensor.py
+++ b/python/cutlass/backend/evt/ir/tensor.py
@@ -34,7 +34,7 @@
 High-level class for tensor
 """

-from cutlass import LayoutType
+from cutlass_library import LayoutType

 from cutlass.backend.evt.ir.layout_algorithm import (
    Layout,
--- a/python/cutlass/backend/evt/passes/graph_drawer.py
+++ b/python/cutlass/backend/evt/passes/graph_drawer.py
@@ -32,9 +32,9 @@

 import subprocess

+from cutlass_library import DataTypeTag
 import pydot

-from cutlass import DataTypeTag
 from cutlass.backend.evt.ir.dag_ir import DAGIR


--- a/python/cutlass/backend/evt/passes/pass_preprocess_red.py
+++ b/python/cutlass/backend/evt/passes/pass_preprocess_red.py
@@ -42,7 +42,6 @@ from cutlass.backend.evt.ir import ComputeNode, StoreNode
 from cutlass.backend.evt.passes.pass_manager import EVTPassBase


-
 class PassPreprocessRed(EVTPassBase):
    """
    Preprocess red nodes
--- a/python/cutlass/backend/evt/passes/smem_size_calculator.py
+++ b/python/cutlass/backend/evt/passes/smem_size_calculator.py
@@ -34,6 +34,7 @@
 Compute the shared memory size in bytes
 """

+import cutlass_library
 from pycute import shape_div, product

 import cutlass
@@ -56,10 +57,13 @@ class GetSmemSize:
    def sm90_epilogue_tile(self, tile_description):
        # Get the epilogue tile size
        schedule = tile_description.epilogue_schedule
-        if schedule == cutlass.EpilogueScheduleType.TmaWarpSpecialized:
+        if schedule == cutlass_library.EpilogueScheduleType.TmaWarpSpecialized:
            epilogue_tile_mn = (64, 32)
-        elif schedule == cutlass.EpilogueScheduleType.TmaWarpSpecializedCooperative:
-            epilogue_tile_mn = (128, 32)
+        elif schedule == cutlass_library.EpilogueScheduleType.TmaWarpSpecializedCooperative:
+            if tile_description.threadblock_shape[0] >= 128:
+                epilogue_tile_mn = (128, 32)
+            else:
+                epilogue_tile_mn = (64, 32)
        else:
            raise NotImplementedError(f"Unsupported schedule: {schedule}")

--- a/python/cutlass/backend/frontend.py
+++ b/python/cutlass/backend/frontend.py
@@ -34,15 +34,7 @@ from cuda import cuda
 import numpy as np

 from cutlass.backend.memory_manager import device_mem_alloc, todevice
-from cutlass.backend.utils.software import CheckPackages
-
-torch_available = CheckPackages().check_torch()
-if torch_available:
-    import torch
-
-cupy_available = CheckPackages().check_cupy()
-if cupy_available:
-    import cupy as cp
+from cutlass.utils.datatypes import is_cupy_tensor, is_numpy_tensor, is_torch_tensor


 class NumpyFrontend:
@@ -97,6 +89,7 @@ class CupyFrontend:
    def argument(cupy_ndarray: "cp.ndarray"):
        return cuda.CUdeviceptr(int(cupy_ndarray.data.ptr))

+
 class TensorFrontend:
    """
    Universal Frontend for client-provide tensors
@@ -104,11 +97,11 @@ class TensorFrontend:

    @staticmethod
    def argument(tensor, is_output=False):
-        if isinstance(tensor, np.ndarray):
+        if is_numpy_tensor(tensor):
            return NumpyFrontend.argument(tensor, is_output)
-        elif torch_available and isinstance(tensor, torch.Tensor):
+        elif is_torch_tensor(tensor):
            return TorchFrontend.argument(tensor)
-        elif cupy_available and isinstance(tensor, cp.ndarray):
+        elif is_cupy_tensor(tensor):
            return CupyFrontend.argument(tensor)
        else:
            raise NotImplementedError("Unknown Tensor Type")
--- a/python/cutlass/backend/gemm_operation.py
+++ b/python/cutlass/backend/gemm_operation.py
@@ -35,10 +35,10 @@ import ctypes
 import enum

 from cuda import cuda, cudart
+from cutlass_library import SubstituteTemplate
 import numpy as np
-import rmm

-from cutlass import (
+from cutlass_library import (
    ComplexTransformTag,
    DataType,
    DataTypeNames,
@@ -96,11 +96,7 @@ from cutlass.backend.library import (
 from cutlass.backend.memory_manager import device_mem_alloc, todevice
 from cutlass.backend.operation import ExecutableOperation, LaunchConfiguration
 from cutlass.backend.type_hint import GemmOperation, Tensor
-from cutlass.backend.utils.software import (
-    CheckPackages,
-    SubstituteTemplate,
-    device_sm_count,
-)
+from cutlass.backend.utils.device import device_sm_count
 from cutlass.shape import GemmCoord, MatrixCoord


@@ -163,7 +159,7 @@ class GemmArguments2x(ArgumentBase):
    :type D: cuda.CUdeviceptr | numpy.ndarray | torch.Tensor | cupy.ndarray

    :param gemm_mode: GEMM mode
-    :type gemm_mode: :class:`cutlass.GemmUniversalMode`
+    :type gemm_mode: :class:`cutlass_library.GemmUniversalMode`

    :param output_op: output operator, optional
    :type output_op: :class:`cutlass.backend.LinearCombinationFunctorArguments`
@@ -387,7 +383,7 @@ class GemmArguments2xStreamK(GemmArguments2x):
    :type D: cuda.CUdeviceptr | numpy.ndarray | torch.Tensor | cupy.ndarray

    :param gemm_mode: GEMM mode
-    :type gemm_mode: :class:`cutlass.GemmUniversalMode`
+    :type gemm_mode: :class:`cutlass_library.GemmUniversalMode`

    :param output_op: output operator, optional
    :type output_op: :class:`cutlass.backend.LinearCombinationFunctorArguments`
@@ -426,9 +422,12 @@ class GemmArguments2xStreamK(GemmArguments2x):

    def initialize(self):
        # Get the host and device workspace
-        device_workspace_size = self.operation.rt_module.get_device_workspace_size(self)
+        device_workspace_size = self.operation.rt_module.get_device_workspace_size(
+            self,
+            device_sm_count(),
+            self.operation.rt_module.occupancy
+        )

-        device_workspace_size = 10 << 20
        if device_workspace_size > 0:
            self.workspace_buffer = device_mem_alloc(device_workspace_size)
            workspace_ptr = self.workspace_buffer.ptr
@@ -626,7 +625,7 @@ def GemmArguments(operation, problem_size, A, B, C, D, gemm_mode=GemmUniversalMo
    :type D: cuda.CUdeviceptr | numpy.ndarray | torch.Tensor | cupy.ndarray

    :param gemm_mode: GEMM mode
-    :type gemm_mode: :class:`cutlass.GemmUniversalMode`
+    :type gemm_mode: :class:`cutlass_library.GemmUniversalMode`

    :param output_op: output operator, optional
    :type output_op: :class:`cutlass.backend.LinearCombinationFunctorArguments`
@@ -1038,6 +1037,11 @@ extern "C" {
    typename GemmType::Params params(*args, device_sms, sm_occupancy);
    return params.get_grid_dims();
  }
+
+  uint64_t ${operation_name}_get_kernel_workspace_size(GemmType::Arguments* args, int device_sms, int sm_occupancy) {
+    typename GemmType::Params params(*args, device_sms, sm_occupancy);
+    return params.get_workspace_size();
+  }
 }
  """

@@ -1045,6 +1049,7 @@ extern "C" {
        super(GemmRTUniversalStreamK, self).__init__(operation)
        self.extra_funcs = {
            "get_grid_shape": GemmCoord_,
+            "get_kernel_workspace_size": ctypes.c_uint64,
        }
        self._occupancy = None
        self.argument_type, self.epilogue_type  = get_gemm_arguments_streamk(operation.epilogue_functor)
@@ -1062,6 +1067,9 @@ extern "C" {
                    f"{cuda.cuGetErrorString(err)[1]}")
        return self._occupancy

+    def get_device_workspace_size(self, arguments: GemmArguments2xStreamK, device_sms: int, sm_occupancy: int):
+        return self.get_kernel_workspace_size(ctypes.byref(arguments.get_arguments()), device_sms, sm_occupancy)
+

 ################################################################################
 # Runtime module for GEMM Universal within CUTLASS 3
@@ -1431,7 +1439,7 @@ ${operation_name}(${operation_name}${operation_suffix}::Params params) {
        problem_info_array = bytearray(problem_info.contents)

        # copy to device memory
-        return rmm.DeviceBuffer.to_device(problem_info_array).ptr
+        return todevice(problem_info_array).ptr

    def plan(self, arguments):
        return LaunchConfiguration(
@@ -1537,10 +1545,6 @@ class GemmOperationBase:

        return err

-    def free(self):
-        if hasattr(self, "workspace_buffer"):
-            del self.workspace_buffer
-
    def is_complex(self):
        complex_operators = [
            MathOperation.multiply_add_complex,
@@ -1627,7 +1631,7 @@ class GemmOperationBase:
            element_b=DataTypeNames[self.B.element],
            element_acc=DataTypeNames[self.tile_description.math_instruction.element_accumulator],
            element_c=DataTypeNames[self.C.element],
-            element_d=DataTypeNames[self.C.element],
+            element_d=DataTypeNames[self.epilogue_functor.element_output],
            core_name=self.core_name())
        return extended_name

--- a/python/cutlass/backend/library.py
+++ b/python/cutlass/backend/library.py
@@ -36,7 +36,7 @@ Common data types and string names/tags for them

 import enum

-from cutlass import (
+from cutlass_library import (
    ComplexTransform,
    DataType,
    DataTypeSize,
@@ -94,18 +94,6 @@ class DataTypeSizeBytes:
        return bits // 8


-SharedMemPerCC = {
-    70: 96 << 10,  # 96KB of SMEM
-    72: 96 << 10,  # 96KB of SMEM
-    75: 64 << 10,  # 64KB of SMEM
-    80: 160 << 10,  # 164KB of SMEM - 4KB reserved for the driver
-    86: 100 << 10,  # 100KB of SMEM
-    87: 160 << 10,  # 164KB of SMEM - 4KB reserved for the driver
-    89: 100 << 10,  # 100KB of SMEM
-    90: 227 << 10,  # 228KB of SMEM - 1KB reserved for the driver
-}
-
-
 class SchedulerMode(enum.Enum):
    Device = enum_auto()
    Host = enum_auto()
@@ -277,11 +265,11 @@ class TileDescription:
        :type math_instruction: MathInstruction
        :param cluster_shape: number of threadblocks in the [X, Y, Z] dimensions of a threadblock cluster
        :param kernel_schedule: type of kernel schedule to use (only available for SM90+)
-        :type kernel_schedule: cutlass.KernelScheduleType
+        :type kernel_schedule: cutlass_library.KernelScheduleType
        :param epilogue_schedule: type of epilogue schedule to use (only available for SM90+)
-        :type epilogue_schedule: cutlass.EpilogueScheduleType
+        :type epilogue_schedule: cutlass_library.EpilogueScheduleType
        :param tile_scheduler: type of tile scheduler to use (only available for SM90+)
-        :type tile_scheduler: cutlass.TileSchedulerType
+        :type tile_scheduler: cutlass_library.TileSchedulerType
        """
        if ((kernel_schedule is None and epilogue_schedule is not None) or
            (kernel_schedule is not None and epilogue_schedule is None)):
@@ -413,7 +401,10 @@ class TensorDescription:
    def __init__(self, element, layout, alignment=1, complex_transform=ComplexTransform.none):
        self.element = element
        self.layout = layout
-        self.alignment = min(128 // DataTypeSize[self.element], alignment)
+        if element != DataType.void:
+            self.alignment = min(128 // DataTypeSize[self.element], alignment)
+        else:
+            self.alignment = alignment
        self.complex_transform = complex_transform


@@ -473,9 +464,9 @@ def api_version(arch, opclass, dtype):
    :param arch: compute capability of device on which to run
    :type arch: int
    :param opclass: class of the operation being performed
-    :type opclass: cutlass.OpcodeClass
+    :type opclass: cutlass_library.OpcodeClass
    :param dtype: data type to be used in operation (assumes that ElementA and ElementB are the same)
-    :type dtype: cutlass.DataType
+    :type dtype: cutlass_library.DataType

    :return: API version to be used in code emission
    :rtype: ApiVersion
--- a/python/cutlass/backend/memory_manager.py
+++ b/python/cutlass/backend/memory_manager.py
@@ -31,7 +31,14 @@
 #################################################################################################

 import numpy as np
-import rmm
+
+import cutlass
+from cutlass.utils.datatypes import is_numpy_tensor
+
+if cutlass.use_rmm:
+    import rmm
+else:
+    from cuda import cudart


 class PoolMemoryManager:
@@ -44,31 +51,70 @@ class PoolMemoryManager:
        self.mr = rmm.mr.TrackingResourceAdaptor(self.pool)
        rmm.mr.set_current_device_resource(self.mr)

-    def get_allocated_size(self):
-        return self.mr.get_allocated_bytes()
-
    def pool_size(self):
        return self.pool.pool_size()


+class DevicePtrWrapper:
+    """
+    Wrapper around a pointer to device memory that exposes the same ``ptr`` attribute as the RMM DeviceBuffer
+    (the only part of that interface defined here). The wrapper itself never releases the memory it points to.
+    """
+    def __init__(self, dev_ptr):
+        self.dev_ptr = dev_ptr  # raw device pointer, stored exactly as given
+
+    @property
+    def ptr(self):
+        return self.dev_ptr  # read-only accessor matching the ``.ptr`` used on RMM buffers
+
+
+def _todevice(host_data):
+    """
+    Helper for transferring host data (a NumPy array) to device memory
+    """
+    if cutlass.use_rmm:
+        return rmm.DeviceBuffer.to_device(host_data.tobytes())
+    # Copy from a C-contiguous buffer so strided views transfer the same bytes as tobytes() would
+    host_data = np.ascontiguousarray(host_data)
+    dev_ptr_wrapper = device_mem_alloc(host_data.nbytes)
+    err, = cudart.cudaMemcpy(
+        dev_ptr_wrapper.ptr,
+        host_data.__array_interface__['data'][0],
+        host_data.nbytes,
+        cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
+    )
+    if err != cudart.cudaError_t.cudaSuccess:
+        raise Exception(f"cudaMemcpy failed with error {err}")
+    return dev_ptr_wrapper
+
+
 def todevice(host_data, dtype=np.float32):
    """
    Pass the host_data to device memory
    """
    if isinstance(host_data, list):
-        return rmm.DeviceBuffer.to_device(np.array(host_data, dtype=dtype).tobytes())
-    elif isinstance(host_data, np.ndarray):
-        return rmm.DeviceBuffer.to_device(host_data.tobytes())
+        return _todevice(np.array(host_data, dtype=dtype))  # lists are first converted to an array of `dtype`
+    elif is_numpy_tensor(host_data):
+        return _todevice(host_data)  # NOTE(review): any other type (e.g. bytes/bytearray) falls through and returns None


 def device_mem_alloc(size):
-    return rmm.DeviceBuffer(size=size)
+    if cutlass.use_rmm:  # allocate through RMM when it is enabled
+        return rmm.DeviceBuffer(size=size)
+    else:
+        err, ptr = cudart.cudaMalloc(size)  # otherwise allocate `size` bytes directly through the CUDA runtime
+        if err != cudart.cudaError_t.cudaSuccess:
+            raise Exception(f"cudaMalloc failed with error {err}")
+        return DevicePtrWrapper(ptr)  # NOTE(review): the wrapper has no destructor; presumably the caller must cudaFree it


 def align_size(size, alignment=256):
    return ((size + alignment - 1) // alignment) * alignment


-def get_allocated_size():
-    device_resource = rmm.mr.get_current_device_resource()
-    return device_resource.get_allocated_bytes()
+def create_memory_pool(init_pool_size=0, max_pool_size=2 ** 34):
+    # Without RMM there is no pool object to hand back
+    if not cutlass.use_rmm:
+        return None
+    # Forward both size bounds unchanged to the RMM-backed pool manager
+    return PoolMemoryManager(init_pool_size=init_pool_size, max_pool_size=max_pool_size)
--- a/python/cutlass/backend/operation.py
+++ b/python/cutlass/backend/operation.py
@@ -37,9 +37,15 @@ from cuda import __version__, cuda
 from cutlass.backend.utils.device import device_cc

 _version_splits = [int(x) for x in __version__.split("rc")[0].split(".")]
-supports_cluster_launch = device_cc() >= 90 and (
-    _version_splits[0] > 11 or (_version_splits[0] == 11 and _version_splits[1] >= 8)
-)
+_supports_cluster_launch = None
+
+
+def supports_cluster_launch():
+    global _supports_cluster_launch
+    if _supports_cluster_launch is None:
+        cuda_version = (_version_splits[0], _version_splits[1])  # (major, minor) parsed from cuda.__version__
+        _supports_cluster_launch = device_cc() >= 90 and cuda_version >= (11, 8)
+    return _supports_cluster_launch


 class LaunchConfiguration:
@@ -121,7 +127,7 @@ class ExecutableOperation:
        packed = (ctypes.c_void_p * 1)()
        packed[0] = ctypes.addressof(cArg)

-        if supports_cluster_launch:
+        if supports_cluster_launch():
            return self.run_with_clusters(launch_config, packed, stream)
        else:
            return self.run_without_clusters(launch_config, packed, stream)
--- a/python/cutlass/backend/reduction_operation.py
+++ b/python/cutlass/backend/reduction_operation.py
@@ -36,21 +36,22 @@ from typing import Union
 from cuda import cuda, cudart
 import numpy as np

-from cutlass import (
+from cutlass_library import (
    DataTypeNames,
    DataTypeSize,
    DataTypeTag,
-    LayoutType
+    LayoutType,
+    SubstituteTemplate
 )
+
+import cutlass
 from cutlass.backend.c_types import MatrixCoord_, TensorRef2D_, get_reduction_params
 from cutlass.backend.frontend import NumpyFrontend, TorchFrontend
 from cutlass.backend.library import TensorDescription
+from cutlass.backend.memory_manager import DevicePtrWrapper
 from cutlass.backend.operation import ExecutableOperation, LaunchConfiguration
-from cutlass.backend.utils.software import CheckPackages, SubstituteTemplate
 from cutlass.shape import MatrixCoord
-
-if CheckPackages().check_torch():
-    import torch
+from cutlass.utils.datatypes import is_numpy_tensor, is_torch_tensor


 class ReductionOperation:
@@ -85,13 +86,13 @@ class ReductionArguments:
        # number of split-k partitions
        self.partitions = partitions

-        if isinstance(destination, np.ndarray):
+        if is_numpy_tensor(destination):
            self.host_D = destination
            self.destination_buffer = NumpyFrontend.argument(destination, True)
            self.source_buffer = NumpyFrontend.argument(source, False)
            self.ptr_destination = cuda.CUdeviceptr(self.destination_buffer.ptr)
            self.ptr_source = cuda.CUdeviceptr(self.source_buffer.ptr)
-        elif CheckPackages().check_torch() and isinstance(destination, torch.Tensor):
+        elif is_torch_tensor(destination):
            self.ptr_destination = TorchFrontend.argument(destination)
            self.ptr_source = TorchFrontend.argument(source)
        elif isinstance(destination, cuda.CUdeviceptr):
@@ -185,11 +186,22 @@ class ReductionArguments:
            if err != cuda.CUresult.CUDA_SUCCESS:
                raise RuntimeError("CUDA Error %s" % str(err))

+        self.free()
+
    def free(self):
-        if hasattr(self, "destination_buffer"):
-            del self.destination_buffer
-        if hasattr(self, "source_buffer"):
-            del self.source_buffer
+        """
+        Frees device-side memory that was allocated manually (i.e., without RMM)
+        """
+        # RMM buffers are not released here; only raw cudaMalloc allocations are
+        if not cutlass.use_rmm:
+            for attr in ["destination_buffer", "source_buffer"]:
+                if hasattr(self, attr):
+                    buf = getattr(self, attr)
+                    if isinstance(buf, DevicePtrWrapper):
+                        err, = cudart.cudaFree(buf.ptr)
+                        if err != cudart.cudaError_t.cudaSuccess:
+                            raise RuntimeError(f"cudaFree failed with error {err}")
+                        delattr(self, attr)  # drop the stale wrapper so a repeated free() cannot cudaFree it twice


 class ReductionRT(ExecutableOperation):
--- a/python/cutlass/backend/utils/init.py
+++ b/python/cutlass/backend/utils/init.py
@@ -30,11 +30,4 @@
 #
 ################################################################################

-from cutlass.backend.utils.datatypes import *
 from cutlass.backend.utils.device import check_cuda_errors, device_cc
-from cutlass.backend.utils.software import (
-    CheckPackages,
-    SubstituteTemplate,
-    device_sm_count,
-    get_memory_pool,
-)
--- a/python/cutlass/backend/utils/datatypes.py
+++ b/python/cutlass/backend/utils/datatypes.py
@@ -1,156 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-
-"""
-Utility functions for converting between frontend datatypes and CUTLASS datatypes
-"""
-
-from cuda import cuda
-
-from cutlass import DataType
-from cutlass.backend.utils.software import CheckPackages
-
-numpy_available = CheckPackages().check_numpy()
-if numpy_available:
-    import numpy as np
-
-    numpy_to_cutlass_dict = {
-        np.float16: DataType.f16,
-        np.float32: DataType.f32,
-        np.float64: DataType.f64,
-        np.int8: DataType.s8,
-        np.int32: DataType.s32,
-        np.dtype('float16'): DataType.f16,
-        np.dtype('float32'): DataType.f32,
-        np.dtype('float64'): DataType.f64,
-        np.dtype('int8'): DataType.s8,
-        np.dtype('int32'): DataType.s32,
-    }
-
-
-def numpy_to_cutlass(inp):
-    numpy_available = CheckPackages().check_numpy()
-    if numpy_available:
-        return numpy_to_cutlass_dict.get(inp, None)
-
-
-cupy_available = CheckPackages().check_cupy()
-if cupy_available:
-    import cupy as cp
-
-    cupy_to_cutlass_dict = {
-        cp.float16: DataType.f16,
-        cp.float32: DataType.f32,
-        cp.float64: DataType.f64,
-    }
-
-
-def cupy_to_cutlass(inp):
-    cupy_available = CheckPackages().check_cupy()
-    if cupy_available:
-        return cupy_to_cutlass_dict.get(inp, None)
-
-
-torch_available = CheckPackages().check_torch()
-if torch_available:
-    import torch
-
-    torch_to_cutlass_dict = {
-        torch.half: DataType.f16,
-        torch.float16: DataType.f16,
-        torch.float: DataType.f32,
-        torch.float32: DataType.f32,
-        torch.double: DataType.f64,
-        torch.float64: DataType.f64,
-    }
-
-
-def torch_to_cutlass(inp):
-    if torch_available:
-        return torch_to_cutlass_dict.get(inp, None)
-
-
-try:
-    import bfloat16
-
-    bfloat16_available = True
-    numpy_to_cutlass_dict[np.dtype(bfloat16.bfloat16)] = DataType.bf16
-except ImportError:
-    bfloat16_available = False
-
-
-def bfloat16_to_cutlass(inp):
-    if bfloat16_available:
-        if inp == bfloat16.bfloat16:
-            return DataType.bf16
-
-
-def to_cutlass(inp):
-    for cvt_fn in [
-        bfloat16_to_cutlass,
-        cupy_to_cutlass,
-        numpy_to_cutlass,
-        torch_to_cutlass,
-    ]:
-        out = cvt_fn(inp)
-        if out is not None:
-            return out
-
-    raise Exception(
-        "No available conversion from type {} to a CUTLASS type.".format(inp)
-    )
-
-
-def to_device_ptr(tensor) -> cuda.CUdeviceptr:
-    """
-    Converts a tensor to a CUdeviceptr
-
-    :param tensor: tensor to convert
-    :type tensor: np.ndarray | torch.Tensor | cp.ndarray | int
-
-    :return: device pointer
-    :rtype: cuda.CUdeviceptr
-    """
-    if isinstance(tensor, np.ndarray):
-        ptr = cuda.CUdeviceptr(tensor.__array_interface__["data"][0])
-    elif torch_available and isinstance(tensor, torch.Tensor):
-        ptr = cuda.CUdeviceptr(tensor.data_ptr())
-    elif cupy_available and isinstance(tensor, cp.ndarray):
-        ptr = cuda.CUdeviceptr(int(tensor.data.ptr))
-    elif isinstance(tensor, cuda.CUdeviceptr):
-        ptr = tensor
-    elif isinstance(tensor, int):
-        ptr = cuda.CUdeviceptr(tensor)
-    else:
-        raise NotImplementedError(tensor)
-
-    return ptr
--- a/python/cutlass/backend/utils/device.py
+++ b/python/cutlass/backend/utils/device.py
@@ -34,7 +34,10 @@
 Utility functions for interacting with the device
 """

-from cuda import cudart
+from cuda import cuda, cudart
+
+import cutlass
+from cutlass.utils.datatypes import is_cupy_tensor, is_numpy_tensor, is_torch_tensor


 def check_cuda_errors(result: list):
@@ -60,7 +63,7 @@ def check_cuda_errors(result: list):
        return result[1:]


-def device_cc(device: int = 0) -> int:
+def device_cc(device: int = -1) -> int:
    """
    Returns the compute capability of the device with ID `device`.

@@ -70,7 +73,51 @@ def device_cc(device: int = 0) -> int:
    :return: compute capability of the queried device (e.g., 80 for SM80)
    :rtype: int
    """
+    if device == -1:
+        device = cutlass.device_id()
+
    deviceProp = check_cuda_errors(cudart.cudaGetDeviceProperties(device))
    major = str(deviceProp.major)
    minor = str(deviceProp.minor)
    return int(major + minor)
+
+
+def device_sm_count(device: int = -1):
+    # A device ID of -1 means "use whatever cutlass.device_id() reports"
+    device_id = cutlass.device_id() if device == -1 else device
+    sm_count_attr = cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
+    err, sm_count = cuda.cuDeviceGetAttribute(sm_count_attr, device_id)
+    if err != cuda.CUresult.CUDA_SUCCESS:
+        # Element [1] of the cuGetErrorString result is used as the readable message
+        raise Exception(
+            "Failed to retireve SM count. "
+            f"cuDeviceGetAttribute() failed with error: {cuda.cuGetErrorString(err)[1]}"
+        )
+
+    return sm_count
+
+
+def to_device_ptr(tensor) -> cuda.CUdeviceptr:
+    """
+    Produces a CUdeviceptr for the memory referred to by the argument
+
+    :param tensor: object to obtain a pointer from
+    :type tensor: np.ndarray | torch.Tensor | cp.ndarray | cuda.CUdeviceptr | int
+
+    :return: device pointer
+    :rtype: cuda.CUdeviceptr
+    """
+    # Each supported frontend exposes its data address differently
+    if is_numpy_tensor(tensor):
+        return cuda.CUdeviceptr(tensor.__array_interface__["data"][0])
+    if is_torch_tensor(tensor):
+        return cuda.CUdeviceptr(tensor.data_ptr())
+    if is_cupy_tensor(tensor):
+        return cuda.CUdeviceptr(int(tensor.data.ptr))
+    # An existing CUdeviceptr is passed through untouched
+    if isinstance(tensor, cuda.CUdeviceptr):
+        return tensor
+    # A plain integer is wrapped as a pointer value
+    if isinstance(tensor, int):
+        return cuda.CUdeviceptr(tensor)
+    raise NotImplementedError(tensor)
--- a/python/cutlass/backend/utils/software.py
+++ b/python/cutlass/backend/utils/software.py
@@ -1,111 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-
-import re
-import sys
-
-from cutlass.backend.memory_manager import PoolMemoryManager
-
-
-class CheckPackages:
-    def __init__(self) -> None:
-        pass
-
-    def check_cupy(self):
-        if "cupy" in sys.modules:
-            return True
-        else:
-            try:
-                import cupy
-
-                cupy_available = True
-            except ImportError:
-                print("cupy is not loaded.")
-
-    def check_numpy(self):
-        if "numpy" in sys.modules:
-            return True
-        else:
-            try:
-                import numpy
-
-                numpy_available = True
-            except ImportError:
-                print("numpy is not loaded.")
-
-    def check_torch(self):
-        if "torch" in sys.modules:
-            return True
-        else:
-            try:
-                import torch
-
-                torch_available = True
-            except ImportError:
-                print("torch is not loaded.")
-
-
-def SubstituteTemplate(template, values):
-    text = template
-    changed = True
-    while changed:
-        changed = False
-        for key, value in values.items():
-            regex = "\\$\\{%s\\}" % key
-            newtext = re.sub(regex, value, text)
-            if newtext != text:
-                changed = True
-            text = newtext
-    return text
-
-
-def device_sm_count():
-    from cuda import cuda
-
-    _device = 0
-    err, _device_sm_count = cuda.cuDeviceGetAttribute(
-        cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, _device
-    )
-    if err != cuda.CUresult.CUDA_SUCCESS:
-        raise Exception(
-            "Failed to retireve SM count. "
-            f"cuDeviceGetAttribute() failed with error: {cuda.cuGetErrorString(err)[1]}"
-        )
-
-    return _device_sm_count
-
-
-def get_memory_pool(init_pool_size=0, max_pool_size=2 ** 34):
-    memory_pool = PoolMemoryManager(
-        init_pool_size=init_pool_size, max_pool_size=max_pool_size
-    )
-    return memory_pool
--- a/python/cutlass/emit/pytorch.py
+++ b/python/cutlass/emit/pytorch.py
@@ -39,7 +39,7 @@ Example usage with JIT compilation:
 .. highlight:: python
 .. code-block:: python

-    plan = cutlass.op.Gemm(element=torch.float32, layout=cutlass.LayoutType.RowMajor)
+    plan = cutlass.op.Gemm(element=torch.float32, layout=cutlass_library.LayoutType.RowMajor)
    op = plan.construct()
    mod = cutlass.emit.pytorch(op, 'cutlass_gemm', 80, jit=True)

@@ -81,15 +81,16 @@ The module can later be used in Python via:
 import logging
 import os

-from cutlass import CUTLASS_PATH, logger, swizzle, ConvKind, ConvKindNames, DataType
+from cutlass_library import ConvKind, ConvKindNames, DataType, SubstituteTemplate
+
+from cutlass import CUTLASS_PATH, logger, swizzle
 from cutlass.backend.gemm_operation import GemmOperationGrouped, GemmOperationUniversal
 from cutlass.backend.conv2d_operation import Conv2dOperation
 from cutlass.backend.library import ApiVersion
-from cutlass.backend.utils.software import CheckPackages, SubstituteTemplate
 from cutlass.emit import common
+from cutlass.utils.datatypes import is_torch_available

-torch_available = CheckPackages().check_torch()
-if torch_available:
+if is_torch_available():
    import torch


--- a/python/cutlass/epilogue/evt_ops.py
+++ b/python/cutlass/epilogue/evt_ops.py
@@ -36,10 +36,9 @@ Collection of builtin functions used for host reference in EVT

 import numpy as np

-from cutlass.backend.utils.software import CheckPackages
+from cutlass.utils.datatypes import is_cupy_tensor, is_numpy_tensor, is_torch_available, is_torch_tensor

-torch_available = CheckPackages().check_torch()
-if torch_available:
+if is_torch_available():
    import torch


@@ -48,16 +47,16 @@ def multiply_add(x, y, z):


 def sum(x, dim):
-    if isinstance(x, np.ndarray):
+    if is_numpy_tensor(x):
        return x.sum(axis=tuple(dim))
-    elif torch_available and isinstance(x, torch.Tensor):
+    elif is_torch_tensor(x):
        return torch.sum(x, dim)


 def max(x, dim):
-    if isinstance(x, np.ndarray):
+    if is_numpy_tensor(x):
        return x.max(axis=tuple(dim))
-    elif torch_available and isinstance(x, torch.Tensor):
+    elif is_torch_tensor(x):
        return torch.amax(x, dim)


@@ -66,14 +65,14 @@ def max(x, dim):
 ##############################################################################

 def permute(x, indices: tuple):
-    if isinstance(x, np.ndarray):
+    if is_numpy_tensor(x):
        return np.transpose(x, axes=indices)
-    elif torch_available and isinstance(x, torch.Tensor):
+    elif is_torch_tensor(x):
        return x.permute(*indices)


 def reshape(x, new_shape: tuple):
-    if isinstance(x, np.ndarray):
+    if is_numpy_tensor(x):
        return np.reshape(x, newshape=new_shape)
-    elif torch_available and isinstance(x, torch.Tensor):
+    elif is_torch_tensor(x):
        return x.view(new_shape)
--- a/python/cutlass/library_defaults.py
+++ b/python/cutlass/library_defaults.py
@@ -69,20 +69,23 @@ class KernelsForDataType:
        """
        Add an operation to the list of supported kernels
        """
-        alignment = operation.A.alignment
-        if alignment not in self.kernels_by_alignment:
-            self.kernels_by_alignment[alignment] = []
-        self.kernels_by_alignment[alignment].append(operation)
+        alignment_key = f"{operation.A.alignment} {operation.B.alignment} {operation.C.alignment}"
+        if alignment_key not in self.kernels_by_alignment:
+            self.kernels_by_alignment[alignment_key] = []
+        self.kernels_by_alignment[alignment_key].append(operation)

-    @property
-    def alignments(self):
+    def alignments(self, operand: str):
        """
        Returns an unsorted list of alignments supported by this data type combination

+        :param operand: identifier of the operand in question; one of "A", "B" or "C"
+        :type operand: str
+
        :return: unsorted list of alignments supported by this data type combination
        :rtype: list
        """
-        return list(self.kernels_by_alignment.keys())
+        operand_idx = self._operand_idx(operand)  # position of the operand inside each "A B C" key
+        return [int(key.split(" ")[operand_idx]) for key in self.kernels_by_alignment.keys()]  # one entry per key, so values may repeat

    @property
    def all_operations(self):
@@ -97,24 +100,48 @@ class KernelsForDataType:
            ops.extend(alignment_ops)
        return ops

-    def operations(self, alignment: int):
-        """
-        Returns operations satisfying the alignment constraint indicated by `alignment`
+    def default_operation(self):
+        first_key = sorted(self.kernels_by_alignment)[0]  # keys are "A B C" strings, so this order is lexicographic
+        return self.kernels_by_alignment[first_key][0]

-        :param alignment: alignment constraint of operations to return
-        :type alignment: int
+    def operations(self, alignment_A: int, alignment_B: int, alignment_C: int):
+        """
+        Returns operations for the given alignments, falling back to the smallest of the three for every operand
+
+        :param alignment_A: requested alignment of operand A
+        :type alignment_A: int
+        :param alignment_B: requested alignment of operand B
+        :type alignment_B: int
+        :param alignment_C: requested alignment of operand C
+        :type alignment_C: int

        :return: list of operations
        :rtype: list
        """
-        if alignment not in self.kernels_by_alignment:
-            raise Exception(
-                f"No operations of alignment {alignment} found for data type and layout "
-                f"combination {self.datatype_comb} {self.layout_comb}"
-            )
-        return self.kernels_by_alignment[alignment]
+        key = f"{alignment_A} {alignment_B} {alignment_C}"  # same "A B C" string format used as the dictionary key

-    def find_alignment(self, shape: tuple, layout: cutlass.LayoutType) -> int:
+        if key not in self.kernels_by_alignment:
+            og_key = key  # kept only for the error message below
+            # Reconcile A, B, and C alignments by trying to align to the minimum
+            min_alignment = min(alignment_A, alignment_B, alignment_C)
+            key = f"{min_alignment} {min_alignment} {min_alignment}"
+            if key not in self.kernels_by_alignment:
+                raise Exception(
+                    f"No operations of alignment {og_key} found for data type and layout "
+                    f"combination {self.datatype_comb} {self.layout_comb}. Tried to fall back "
+                    f"to alignment {key}, but that was also not compatible. Compatible alignments "
+                    f"are {self.kernels_by_alignment.keys()}"
+                )
+        return self.kernels_by_alignment[key]
+
+    def _operand_idx(self, key: str) -> int:
+        # Map an operand name ("A", "B" or "C") to its position within an "A B C" alignment key
+        operand_list = ["A", "B", "C"]
+        if key not in operand_list:
+            raise Exception(f"Unexpected operand {key}")
+        return operand_list.index(key)
+
+    def find_alignment(self, shape: tuple, layout: cutlass.LayoutType, operand=str) -> int:
        """
        Returns the most preferable alignment for a given shape and layout

@@ -122,10 +149,14 @@ class KernelsForDataType:
        :type shape: tuple
        :param layout: layout of the tensor
        :type layout: cutlass.LayoutType
+        :param operand: descriptor of the operand in question
+        :type operand: str

        :return: maximum alignment supported by the data type combination and tensor size
        :rtype: int
        """
+        operand_idx = self._operand_idx(operand)
+
        # Determine the leading dimension of the shape
        if layout == cutlass.LayoutType.ColumnMajor:
            ld = shape[-2]
@@ -136,7 +167,8 @@ class KernelsForDataType:
        else:
            raise Exception(f"Unexpected or unsupported layout {layout}")

-        for alignment in sorted(list(self.kernels_by_alignment.keys()), reverse=True):
+        for alignments in sorted(list(self.kernels_by_alignment.keys()), reverse=True):
+            alignment = int(alignments.split(" ")[operand_idx])
            if ld % alignment == 0:
                return alignment

@@ -165,7 +197,7 @@ class ArchOptions:
    :param kernel_cc: compute capability of the kernels to generate
    :type kernel_cc: int
    :param operation_kind: type of operation to register
-    :type operation_kind: cutlass.OperationKind
+    :type operation_kind: cutlass_library.OperationKind
    :param gemm_kinds: types of GEMM operations that can be included
    :type gemm_kinds: list
    :param allowed_math_operations: types of primitive math operations allowed
@@ -176,11 +208,12 @@ class ArchOptions:
        self,
        target_cc: int,
        kernel_cc: int,
-        operation_kind: cutlass.OperationKind,
+        operation_kind: cutlass_library.OperationKind,
        gemm_kinds: list,
        allowed_math_operations: list = [
-            cutlass.MathOperation.multiply_add,
-            cutlass.MathOperation.multiply_add_saturate,
+            cutlass_library.MathOperation.multiply_add,
+            cutlass_library.MathOperation.multiply_add_saturate,
+            cutlass_library.MathOperation.multiply_add_mixed_input_upcast
        ]
    ):
        self.cc = kernel_cc
@@ -229,7 +262,7 @@ class ArchOptions:
        # find available opclasses and data types
        for name, op_list in manifest.operations[operation_kind][kernel_cc].items():
            for op in op_list:
-                if operation_kind == cutlass.OperationKind.Gemm:
+                if operation_kind == cutlass_library.OperationKind.Gemm:
                    if op.gemm_kind not in gemm_kinds:
                        continue

@@ -237,15 +270,11 @@ class ArchOptions:
                if mi.math_operation not in self.allowed_math_operations:
                    continue

-                if op.C.element == cutlass.DataType.void:
-                    # The CUTLASS Python interface currently does not support void-C kernels
-                    continue
-
                datatype_comb = (mi.element_a, mi.element_b, mi.element_accumulator)

                # Prune operations that don't fit in shared memory
                td = td_from_profiler_op(op)
-                if not valid_stage_count(target_cc, kernel_cc, td)[0]:
+                if not valid_stage_count(target_cc, kernel_cc, td, verbose=False)[0]:
                    continue

                if mi.opcode_class not in self.operations_by_opclass:
@@ -255,17 +284,17 @@ class ArchOptions:
                layout_comb = (op.A.layout, op.B.layout)

                # Register TF32 kernels as F32 to enable F32 -> TF32 conversion + TF32 Tensor Core operations
-                if datatype_comb == (cutlass.DataType.tf32, cutlass.DataType.tf32, cutlass.DataType.f32):
+                if datatype_comb == (cutlass_library.DataType.tf32, cutlass_library.DataType.tf32, cutlass_library.DataType.f32):
                    # TF32 kernels only supported on SM80 and beyond
                    if self.cc < 80:
                        continue
                    elif self.cc == 90:
-                        if (op.A.element != cutlass.DataType.f32
-                            or op.B.element != cutlass.DataType.f32
-                            or op.C.element != cutlass.DataType.f32):
+                        if (op.A.element != cutlass_library.DataType.f32
+                            or op.B.element != cutlass_library.DataType.f32
+                            or op.C.element != cutlass_library.DataType.f32):
                            continue

-                    datatype_comb = (cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32)
+                    datatype_comb = (cutlass_library.DataType.f32, cutlass_library.DataType.f32, cutlass_library.DataType.f32)

                opclass_dict = self.operations_by_opclass[mi.opcode_class]
                key = (datatype_comb, layout_comb)
@@ -274,82 +303,82 @@ class ArchOptions:
                opclass_dict[key].add(op)

        # Set the default opclass to TensorOp, if available. Otherwise default to SIMT
-        if cutlass.OpcodeClass.TensorOp in self.operations_by_opclass:
-            self.op_class = cutlass.OpcodeClass.TensorOp
+        if cutlass_library.OpcodeClass.TensorOp in self.operations_by_opclass:
+            self.op_class = cutlass_library.OpcodeClass.TensorOp
        else:
-            self.op_class = cutlass.OpcodeClass.Simt
+            self.op_class = cutlass_library.OpcodeClass.Simt

        # The profiler's generator may generate only a limited set of combinations of operands for SIMT kernels.
        # Here, we generate additional versions via a generic TileDescription.
-        if cutlass.OpcodeClass.Simt not in self.operations_by_opclass:
-            self.operations_by_opclass[cutlass.OpcodeClass.Simt] = {}
+        if cutlass_library.OpcodeClass.Simt not in self.operations_by_opclass:
+            self.operations_by_opclass[cutlass_library.OpcodeClass.Simt] = {}

-        if operation_kind == cutlass.OperationKind.Gemm:
+        if operation_kind == cutlass_library.OperationKind.Gemm:
            types = [
-                (cutlass.DataType.s8, cutlass.DataType.s8, cutlass.DataType.s8),
-                (cutlass.DataType.s8, cutlass.DataType.s8, cutlass.DataType.s32),
-                (cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16),
-                (cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f32),
-                (cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32),
-                (cutlass.DataType.f64, cutlass.DataType.f64, cutlass.DataType.f64),
+                (cutlass_library.DataType.s8, cutlass_library.DataType.s8, cutlass_library.DataType.s8),
+                (cutlass_library.DataType.s8, cutlass_library.DataType.s8, cutlass_library.DataType.s32),
+                (cutlass_library.DataType.f16, cutlass_library.DataType.f16, cutlass_library.DataType.f16),
+                (cutlass_library.DataType.f16, cutlass_library.DataType.f16, cutlass_library.DataType.f32),
+                (cutlass_library.DataType.f32, cutlass_library.DataType.f32, cutlass_library.DataType.f32),
+                (cutlass_library.DataType.f64, cutlass_library.DataType.f64, cutlass_library.DataType.f64),
            ]

            layouts = [
-                (cutlass.LayoutType.RowMajor, cutlass.LayoutType.RowMajor),
-                (cutlass.LayoutType.RowMajor, cutlass.LayoutType.ColumnMajor),
-                (cutlass.LayoutType.ColumnMajor, cutlass.LayoutType.RowMajor),
-                (cutlass.LayoutType.ColumnMajor, cutlass.LayoutType.ColumnMajor),
+                (cutlass_library.LayoutType.RowMajor, cutlass_library.LayoutType.RowMajor),
+                (cutlass_library.LayoutType.RowMajor, cutlass_library.LayoutType.ColumnMajor),
+                (cutlass_library.LayoutType.ColumnMajor, cutlass_library.LayoutType.RowMajor),
+                (cutlass_library.LayoutType.ColumnMajor, cutlass_library.LayoutType.ColumnMajor),
            ]
-        elif operation_kind == cutlass.OperationKind.Conv2d:
+        elif operation_kind == cutlass_library.OperationKind.Conv2d:
            types = [
-                (cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16),
-                (cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f32),
-                (cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32),
-                (cutlass.DataType.f64, cutlass.DataType.f64, cutlass.DataType.f64),
+                (cutlass_library.DataType.f16, cutlass_library.DataType.f16, cutlass_library.DataType.f16),
+                (cutlass_library.DataType.f16, cutlass_library.DataType.f16, cutlass_library.DataType.f32),
+                (cutlass_library.DataType.f32, cutlass_library.DataType.f32, cutlass_library.DataType.f32),
+                (cutlass_library.DataType.f64, cutlass_library.DataType.f64, cutlass_library.DataType.f64),
            ]

            layouts = [
-                (cutlass.LayoutType.TensorNHWC, cutlass.LayoutType.TensorNHWC),
+                (cutlass_library.LayoutType.TensorNHWC, cutlass_library.LayoutType.TensorNHWC),
            ]
        else:
            raise NotImplementedError(f"Operation kind {operation_kind} is currently unsupported.")

        alignment = 1
-        epilogue_functor = cutlass.EpilogueFunctor.LinearCombination
-        swizzling_functor = cutlass.SwizzlingFunctor.Identity8
+        epilogue_functor = cutlass_library.EpilogueFunctor.LinearCombination
+        swizzling_functor = cutlass_library.SwizzlingFunctor.Identity8
        for type_comb in types:
            for layout_comb in layouts:
                comb = (type_comb, layout_comb)
-                if comb in self.operations_by_opclass[cutlass.OpcodeClass.Simt]:
+                if comb in self.operations_by_opclass[cutlass_library.OpcodeClass.Simt]:
                    continue

-                A = cutlass.TensorDescription(type_comb[0], layout_comb[0], alignment)
-                B = cutlass.TensorDescription(type_comb[1], layout_comb[1], alignment)
-                C = cutlass.TensorDescription(type_comb[2], cutlass.LayoutType.ColumnMajor, alignment)
-                math_inst = cutlass.MathInstruction(
+                A = cutlass_library.TensorDescription(type_comb[0], layout_comb[0], alignment)
+                B = cutlass_library.TensorDescription(type_comb[1], layout_comb[1], alignment)
+                C = cutlass_library.TensorDescription(type_comb[2], cutlass_library.LayoutType.ColumnMajor, alignment)
+                math_inst = cutlass_library.MathInstruction(
                    [1, 1, 1],
                    type_comb[0],
                    type_comb[1],
                    type_comb[2],
-                    cutlass.OpcodeClass.Simt,
-                    cutlass.MathOperation.multiply_add
+                    cutlass_library.OpcodeClass.Simt,
+                    cutlass_library.MathOperation.multiply_add
                )

-                td = cutlass.TileDescription(
+                td = cutlass_library.TileDescription(
                    [128, 128, 8], 2, [4, 2, 1], math_inst, 50, 1024)

                # Prune operations that don't fit in shared memory
-                if not valid_stage_count(target_cc, kernel_cc, td_from_profiler_td(td))[0]:
+                if not valid_stage_count(target_cc, kernel_cc, td_from_profiler_td(td), verbose=False)[0]:
                    continue

                new_kernels = KernelsForDataType(type_comb, layout_comb)

-                if operation_kind == cutlass.OperationKind.Gemm:
+                if operation_kind == cutlass_library.OperationKind.Gemm:
                    new_operation = cutlass_library.manifest.GemmOperation(
-                        cutlass.GemmKind.Universal, td.minimum_compute_capability,
+                        cutlass_library.GemmKind.Universal, td.minimum_compute_capability,
                        td, A, B, C, type_comb[2], epilogue_functor, swizzling_functor)
                    new_kernels.add(new_operation)
-                elif operation_kind == cutlass.OperationKind.Conv2d:
+                elif operation_kind == cutlass_library.OperationKind.Conv2d:
                    for conv_kind in [ConvKind.Fprop, ConvKind.Dgrad, ConvKind.Wgrad]:
                        new_operation = cutlass_library.manifest.Conv2dOperation(
                            conv_kind, IteratorAlgorithm.Analytic, td.minimum_compute_capability, td,
@@ -358,7 +387,7 @@ class ArchOptions:
                        )
                        new_kernels.add(new_operation)

-                self.operations_by_opclass[cutlass.OpcodeClass.Simt][comb] = new_kernels
+                self.operations_by_opclass[cutlass_library.OpcodeClass.Simt][comb] = new_kernels

        # Sort all operations
        for oc in self.operations_by_opclass.keys():
@@ -366,17 +395,17 @@ class ArchOptions:
                self.operations_by_opclass[oc][comb].sort()

    def opclass_supports_combination(
-        self, op_class: cutlass.OpcodeClass, datatype_comb: tuple, layout_comb: tuple
+        self, op_class: cutlass_library.OpcodeClass, datatype_comb: tuple, layout_comb: tuple
    ) -> bool:
        """
        Returns whether the provided operation class supports the provided data type and layout combination

        :param op_class: operation class to consider
-        :type op_class: cutlass.OpcodeClass
+        :type op_class: cutlass_library.OpcodeClass
        :param datatype_comb: tuple of data types for (element_A, element_B, element_accumulator)
-        :type datatype_comb: tuple[cutlass.DataType]
+        :type datatype_comb: tuple[cutlass_library.DataType]
        :param layout_comb: tuple of data types for (layout_A, layout_B)
-        :type layout_comb: tuple[cutlass.LayoutType]
+        :type layout_comb: tuple[cutlass_library.LayoutType]

        :return: set of operation classes that support the provided data type and layout combination
        :rtype: set
@@ -388,25 +417,25 @@ class ArchOptions:

    def supporting_opclasses(
        self,
-        element_a: cutlass.DataType,
-        element_b: cutlass.DataType,
-        element_accumulator: cutlass.DataType,
-        layout_a: cutlass.LayoutType,
-        layout_b: cutlass.LayoutType,
+        element_a: cutlass_library.DataType,
+        element_b: cutlass_library.DataType,
+        element_accumulator: cutlass_library.DataType,
+        layout_a: cutlass_library.LayoutType,
+        layout_b: cutlass_library.LayoutType,
    ) -> set:
        """
        Returns a set of operation classes that support the provided data type combination

        :param element_a: data type of operand A
-        :type element_a: cutlass.DataType
+        :type element_a: cutlass_library.DataType
        :param element_b: data type of operand B
-        :type element_b: cutlass.DataType
+        :type element_b: cutlass_library.DataType
        :param element_accumulator: data type of accumulator
-        :type element_accumulator: cutlass.DataType
+        :type element_accumulator: cutlass_library.DataType
        :param layout_a: layout of operand A
-        :type layout_a: cutlass.LayoutType
+        :type layout_a: cutlass_library.LayoutType
        :param layout_b: layout of operand B
-        :type layout_b: cutlass.LayoutType
+        :type layout_b: cutlass_library.LayoutType

        :return: set of operation classes that support the provided data type combination
        :rtype: set
@@ -422,28 +451,28 @@ class ArchOptions:

    def operations(
        self,
-        op_class: cutlass.OpcodeClass,
-        element_a: cutlass.DataType,
-        element_b: cutlass.DataType,
-        element_accumulator: cutlass.DataType,
-        layout_a: cutlass.LayoutType,
-        layout_b: cutlass.LayoutType,
+        op_class: cutlass_library.OpcodeClass,
+        element_a: cutlass_library.DataType,
+        element_b: cutlass_library.DataType,
+        element_accumulator: cutlass_library.DataType,
+        layout_a: cutlass_library.LayoutType,
+        layout_b: cutlass_library.LayoutType,
    ) -> KernelsForDataType:
        """
        Returns whether the provided operation class supports the provided data type combination

        :param op_class: operation class to consider
-        :type op_class: cutlass.OpcodeClass
+        :type op_class: cutlass_library.OpcodeClass
        :param element_a: data type of operand A
-        :type element_a: cutlass.DataType
+        :type element_a: cutlass_library.DataType
        :param element_b: data type of operand B
-        :type element_b: cutlass.DataType
+        :type element_b: cutlass_library.DataType
        :param element_accumulator: data type of accumulator
-        :type element_accumulator: cutlass.DataType
+        :type element_accumulator: cutlass_library.DataType
        :param layout_a: layout of operand A
-        :type layout_a: cutlass.LayoutType
+        :type layout_a: cutlass_library.LayoutType
        :param layout_b: layout of operand B
-        :type layout_b: cutlass.LayoutType
+        :type layout_b: cutlass_library.LayoutType

        :return: container of kernels by alignment supported by the provided combination of parameters
        :rtype: KernelsForDataType
@@ -469,13 +498,13 @@ class OptionRegistry:
    def __init__(self, target_cc: int):
        self.registry = {}

-        gemm_kinds = [cutlass.GemmKind.Universal, cutlass.GemmKind.Universal3x]
-        operation_kinds = [cutlass.OperationKind.Gemm, cutlass.OperationKind.Conv2d]
+        gemm_kinds = [cutlass_library.GemmKind.Universal, cutlass_library.GemmKind.Universal3x]
+        operation_kinds = [cutlass_library.OperationKind.Gemm, cutlass_library.OperationKind.Conv2d]
        # Construct options for each CC
        for kernel_cc in _generator_ccs:
            self.registry[kernel_cc] = {}
            for opkind in operation_kinds:
                self.registry[kernel_cc][opkind] = ArchOptions(target_cc, kernel_cc, opkind, gemm_kinds)

-    def options_for_cc(self, cc: int, op_kind=cutlass.OperationKind.Gemm) -> ArchOptions:
+    def options_for_cc(self, cc: int, op_kind=cutlass_library.OperationKind.Gemm) -> ArchOptions:
        return self.registry.get(cc, None)[op_kind]
--- a/python/cutlass/op/conv.py
+++ b/python/cutlass/op/conv.py
@@ -112,15 +112,18 @@
        args.sync()
 """

-import cutlass
-from cutlass import epilogue
-from cutlass import (
+from cutlass_library import (
    ConvKind,
    ConvMode,
+    DataTypeSize,
    IteratorAlgorithm,
+    OperationKind,
    SplitKMode,
    StrideSupport,
 )
+
+import cutlass
+from cutlass import epilogue
 from cutlass.backend import compiler
 from cutlass.backend.conv2d_operation import Conv2dArguments, Conv2dOperation
 from cutlass.backend.reduction_operation import ReductionOperation, ReductionArguments
@@ -202,7 +205,7 @@ class Conv2d(OperationBase):
        element_accumulator=None,
        cc: int = None, kernel_cc: int = None
    ):
-        super().__init__(cc=cc, kernel_cc=kernel_cc, operation_kind=cutlass.OperationKind.Conv2d)
+        super().__init__(cc=cc, kernel_cc=kernel_cc, operation_kind=OperationKind.Conv2d)
        # Verify the kernel cc
        if self.current_cc == 90:
            # The Conv2d kernel on Hopper (SM90) is currently unsupported
@@ -305,11 +308,11 @@ class Conv2d(OperationBase):
            self._reset_epilogue_functor_activation(epilogue.identity)

        self.alignment_pref_A = min(
-            128 // cutlass.DataTypeSize[self._element_a], max(self.possible_operations.alignments))
+            128 // DataTypeSize[self._element_a], max(self.possible_operations.alignments("A")))
        self.alignment_pref_B = min(
-            128 // cutlass.DataTypeSize[self._element_b], max(self.possible_operations.alignments))
+            128 // DataTypeSize[self._element_b], max(self.possible_operations.alignments("B")))
        self.alignment_pref_C = min(
-            128 // cutlass.DataTypeSize[self._element_c], max(self.possible_operations.alignments))
+            128 // DataTypeSize[self._element_c], max(self.possible_operations.alignments("C")))

    #
    # Tile description Related
@@ -342,8 +345,7 @@ class Conv2d(OperationBase):
            return
        if isinstance(td, dict):
            if self._tile_description is None:
-                alignment = list(self.possible_operations.kernels_by_alignment.keys())[0]
-                op = self.possible_operations.operations(alignment)[0]
+                op = self.possible_operations.default_operation()
                self._tile_description = datatypes.td_from_profiler_op(op)
            if "cluster_shape" in td.keys():
                if td["cluster_shape"] != [1, 1, 1]:
@@ -567,8 +569,7 @@ class Conv2d(OperationBase):
            if self.tile_description is not None:
                tile_description = self.tile_description
            else:
-                min_alignment = min([alignment_A, alignment_B, alignment_C])
-                op = self.possible_operations.operations(min_alignment)[0]
+                op = self.possible_operations.operations(alignment_A, alignment_B, alignment_C)[0]
                tile_description = datatypes.td_from_profiler_op(op)
        else:
            valid, err_str = self._valid_tile_description(tile_description)
@@ -753,6 +754,8 @@ class Conv2d(OperationBase):
        :return: arguments passed in to the kernel
        :rtype: cutlass.backend.Conv2dArguments
        """
+        super().run_setup()
+
        A = self._verify_tensor(A, self.A, self._element_a, self._layout_a, "A")
        B = self._verify_tensor(B, self.B, self._element_b, self._layout_b, "B")
        C = self._verify_tensor(C, self.C, self._element_c, self._layout_c, "C")
@@ -782,9 +785,9 @@ class Conv2d(OperationBase):
        shape_c = datatypes.get_tensor_shape(C, op="CONV")

        # Get the alignment
-        alignment_a = self.possible_operations.find_alignment(shape_a, self._layout_a)
-        alignment_b = self.possible_operations.find_alignment(shape_b, self._layout_b)
-        alignment_c = self.possible_operations.find_alignment(shape_c, self._layout_c)
+        alignment_a = self.possible_operations.find_alignment(shape_a, self._layout_a, operand="A")
+        alignment_b = self.possible_operations.find_alignment(shape_b, self._layout_b, operand="B")
+        alignment_c = self.possible_operations.find_alignment(shape_c, self._layout_c, operand="C")

        alignment_a = check.update_alignment(alignment_a, self.alignment_pref_A)
        alignment_b = check.update_alignment(alignment_b, self.alignment_pref_B)
@@ -858,6 +861,10 @@ class Conv2d(OperationBase):
        if sync:
            if split_k[0] == "parallel" and split_k[1] > 1:
                reduction_arguments.sync()
+
+                # Free memory allocated by args because we are not
+                # calling `arguments.sync()` in this case (which will free memory)
+                arguments.free()
            else:
                arguments.sync()

--- a/python/cutlass/op/gemm.py
+++ b/python/cutlass/op/gemm.py
@@ -116,12 +116,14 @@

 from math import prod

-import cutlass
-from cutlass import (
-    epilogue,
-    swizzle,
+from cutlass_library import (
+    DataType,
+    DataTypeSize,
    GemmUniversalMode,
 )
+
+import cutlass
+from cutlass import epilogue, swizzle
 from cutlass.backend import compiler
 from cutlass.backend.evt import EpilogueFunctorVisitor
 from cutlass.backend.gemm_operation import GemmArguments, GemmOperationUniversal
@@ -292,7 +294,7 @@ class Gemm(OperationBase):
                            f'combination {datatype_comb}x{layout_comb}')

        if reset_epilogue:
-            self._reset_epilogue_functor_activation(epilogue.identity)
+            self._reset_epilogue_functor_activation(cutlass.epilogue.identity)

    @property
    def swizzling_functor(self):
@@ -308,7 +310,7 @@ class Gemm(OperationBase):
        """
        Sets the swizzling functor to the type specified by `swizzling_functor`
        """
-        if swizzling_functor == swizzle.ThreadblockSwizzleStreamK:
+        if swizzling_functor == cutlass.swizzle.ThreadblockSwizzleStreamK:
            if self.op_class == cutlass.OpcodeClass.Simt:
                raise Exception('ThreadblockSwizzleStreamK is currently only supported with opcode class TensorOp')

@@ -347,8 +349,7 @@ class Gemm(OperationBase):
            return
        if isinstance(td, dict):
            if self._tile_description is None:
-                alignment = list(self.possible_operations.kernels_by_alignment.keys())[0]
-                op = self.possible_operations.operations(alignment)[0]
+                op = self.possible_operations.default_operation()
                self._tile_description = datatypes.td_from_profiler_op(op)
            td = self._tile_description.clone_and_update(td)

@@ -414,22 +415,25 @@ class Gemm(OperationBase):
        :return: operation that was constructed
        :rtype: cutlass.backend.GemmOperationUniversal
        """
-        alignment_pref_A = min(128 // cutlass.DataTypeSize[self._element_a], max(self.possible_operations.alignments))
-        alignment_pref_B = min(128 // cutlass.DataTypeSize[self._element_b], max(self.possible_operations.alignments))
-        alignment_pref_C = min(128 // cutlass.DataTypeSize[self._element_c], max(self.possible_operations.alignments))
+        alignment_pref_A = min(128 // DataTypeSize[self._element_a], max(self.possible_operations.alignments("A")))
+        alignment_pref_B = min(128 // DataTypeSize[self._element_b], max(self.possible_operations.alignments("B")))
        alignment_A = check.alignment_or_default(alignment_A, alignment_pref_A)
        alignment_B = check.alignment_or_default(alignment_B, alignment_pref_B)
-        alignment_C = check.alignment_or_default(alignment_C, alignment_pref_C)
-
-        self.epilogue_functor = self._reset_epilogue_functor_alignment(alignment_C, self.epilogue_functor)

        tensor_A = TensorDescription(self._element_a, self._layout_a, alignment_A)
        tensor_B = TensorDescription(self._element_b, self._layout_b, alignment_B)
+
+        alignment_pref_C = max(self.possible_operations.alignments("C"))
+        if self._element_c != DataType.void:
+            alignment_pref_C = min(128 // DataTypeSize[self._element_c], alignment_pref_C)
+
+        alignment_C = check.alignment_or_default(alignment_C, alignment_pref_C)
        tensor_C = TensorDescription(self._element_c, self._layout_c, alignment_C)
+        self.epilogue_functor = self._reset_epilogue_functor_alignment(alignment_C, self.epilogue_functor)

        if tile_description is None:
            if self._tile_description is None:
-                op = self.possible_operations.operations(alignment_A)[0]
+                op = self.possible_operations.operations(alignment_A, alignment_B, alignment_C)[0]
                tile_description = datatypes.td_from_profiler_op(op)
            else:
                tile_description = self._tile_description
@@ -527,7 +531,7 @@ class Gemm(OperationBase):
        :return: stride between each matrix in the batch
        :rtype: int
        """
-        if len(tensor.shape) > 2:
+        if tensor is not None and len(tensor.shape) > 2:
            return tensor.shape[-2] * tensor.shape[-1]
        else:
            return 0
@@ -566,12 +570,14 @@ class Gemm(OperationBase):
            B_row = self._layout_b == cutlass.LayoutType.RowMajor
            C_row = self._layout_c == cutlass.LayoutType.RowMajor

-            batched = lambda x : len(x.shape) > 2 and prod(x.shape[:-2]) == batch_count
+            # Consider a Tensor to be batched if its rank is > 2 and
+            # the product of the modes beyond rank 2 equals our pre-determined batch size.
+            batched = lambda x : x is None or (len(x.shape) > 2 and prod(x.shape[:-2]) == batch_count)

-            if batched(A) and not batched(B) and batched(C) and A_row and C_row:
+            if batched(A) and not batched(B) and (C is None or batched(C)) and A_row and C_row:
                M *= batch_count
                returned_batch_count = 1
-            elif not batched(A) and batched(B) and batched(C) and not B_row and not C_row:
+            elif not batched(A) and batched(B) and (C is None or batched(C)) and not B_row and not C_row:
                N *= batch_count
                returned_batch_count = 1
            else:
@@ -625,6 +631,7 @@ class Gemm(OperationBase):
        :return: arguments passed in to the kernel
        :rtype: cutlass.backend.GemmArguments
        """
+        super().run_setup()
        A = self._verify_tensor(A, self.A, self._element_a, self._layout_a, "A")
        B = self._verify_tensor(B, self.B, self._element_b, self._layout_b, "B")
        C = self._verify_tensor(C, self.C, self._element_c, self._layout_c, "C")
@@ -632,14 +639,20 @@ class Gemm(OperationBase):
        alpha = self._verify_scalar(alpha, self.alpha, self._element_c, "alpha")
        beta = self._verify_scalar(beta, self.beta, self._element_c, "beta")

+        is_void_c = self._element_c == DataType.void
+
        self._verify_rank(A)
        self._verify_rank(B)
-        self._verify_rank(C)
+        if not is_void_c:
+            self._verify_rank(C)
        self._verify_rank(D)

-        alignment_a = self.possible_operations.find_alignment(A.shape, self._layout_a)
-        alignment_b = self.possible_operations.find_alignment(B.shape, self._layout_b)
-        alignment_c = self.possible_operations.find_alignment(C.shape, self._layout_c)
+        alignment_a = self.possible_operations.find_alignment(A.shape, self._layout_a, operand="A")
+        alignment_b = self.possible_operations.find_alignment(B.shape, self._layout_b, operand="B")
+
+        # Set C alignment based on D.shape so as to correctly get an alignment with void-C
+        # kernels, for which `C` is None.
+        alignment_c = self.possible_operations.find_alignment(D.shape, self._layout_c, operand="C")
        self.compile(self._tile_description, alignment_A=alignment_a, alignment_B=alignment_b,
                     alignment_C=alignment_c, print_module=print_module)

--- a/python/cutlass/op/gemm_grouped.py
+++ b/python/cutlass/op/gemm_grouped.py
@@ -51,7 +51,8 @@
        plan.run([A0, A1], [B0, B1], [C0, C1], [D0, D1])
 """

-from cutlass import DataTypeSize
+from cutlass_library import DataTypeSize
+
 from cutlass.backend.gemm_operation import (
    GemmGroupedArguments,
    GemmOperationGrouped,
@@ -162,10 +163,9 @@ class GroupedGemm(Gemm):
        :return: operation that was constructed
        :rtype: cutlass.backend.GemmOperationGrouped
        """
-        alignment_preference = max(self.possible_operations.alignments)
-        alignment_A = check.alignment_or_default(alignment_A, alignment_preference)
-        alignment_B = check.alignment_or_default(alignment_B, alignment_preference)
-        alignment_C = check.alignment_or_default(alignment_C, alignment_preference)
+        alignment_A = check.alignment_or_default(alignment_A, max(self.possible_operations.alignments("A")))
+        alignment_B = check.alignment_or_default(alignment_B, max(self.possible_operations.alignments("B")))
+        alignment_C = check.alignment_or_default(alignment_C, max(self.possible_operations.alignments("C")))

        self.epilogue_functor = self._reset_epilogue_functor_alignment(alignment_C, self.epilogue_functor)

@@ -174,7 +174,7 @@ class GroupedGemm(Gemm):
        tensor_C = TensorDescription(self._element_c, self._layout_c, alignment_C)

        if tile_description is None:
-            op = self.possible_operations.operations(alignment_A)[0]
+            op = self.possible_operations.operations(alignment_A, alignment_B, alignment_C)[0]
            tile_description = datatypes.td_from_profiler_op(op)
        else:
            valid, err_str = self._valid_tile_description(tile_description)
@@ -221,6 +221,8 @@ class GroupedGemm(Gemm):
        :return: arguments passed in to the kernel
        :rtype: cutlass.backend.GemmGroupedArguments
        """
+        super().run_setup()
+
        if len(A) != len(B) or len(A) != len(C) or len(A) != len(D):
            raise Exception("Lengths of A, B, C, and D lists must be equal")

@@ -236,9 +238,9 @@ class GroupedGemm(Gemm):
        alpha = self._verify_scalar(alpha, self.alpha, self._element_c, "alpha")
        beta = self._verify_scalar(beta, self.beta, self._element_c, "beta")

-        alignment_a = min((self.possible_operations.find_alignment(A.shape, self._layout_a) for A in As))
-        alignment_b = min((self.possible_operations.find_alignment(B.shape, self._layout_b) for B in Bs))
-        alignment_c = min((self.possible_operations.find_alignment(C.shape, self._layout_c) for C in Cs))
+        alignment_a = min((self.possible_operations.find_alignment(A.shape, self._layout_a, operand="A") for A in As))
+        alignment_b = min((self.possible_operations.find_alignment(B.shape, self._layout_b, operand="B") for B in Bs))
+        alignment_c = min((self.possible_operations.find_alignment(C.shape, self._layout_c, operand="C") for C in Cs))
        self.compile(self.tile_description, alignment_A=alignment_a, alignment_B=alignment_b,
                     alignment_C=alignment_c, print_module=print_module)

--- a/python/cutlass/op/op.py
+++ b/python/cutlass/op/op.py
@@ -36,11 +36,13 @@ Base operation used for defining high-level CUTLASS operations (e.g., GEMM, Conv

 from bisect import bisect_left

+from cutlass_library import DataType, DataTypeSize, OperationKind, SharedMemPerCC
+
 import cutlass
-from cutlass import option_registry, epilogue
+from cutlass import get_option_registry
 from cutlass.backend.evt import EpilogueFunctorVisitor
 from cutlass.backend.utils.device import device_cc
-from cutlass.epilogue import get_activations
+from cutlass.epilogue import get_activations, get_activation_epilogue, identity
 from cutlass.library_defaults import KernelsForDataType, _generator_ccs
 from cutlass.swizzle import get_swizzling_functors
 from cutlass.utils import datatypes, check
@@ -51,12 +53,14 @@ class OperationBase:
    Base operation used for defining high-level CUTLASS operations (e.g., GEMM, Conv2d)
    """

-    def __init__(self, cc: int = None, kernel_cc: int = None, operation_kind = cutlass.OperationKind.Gemm):
+    def __init__(self, cc: int = None, kernel_cc: int = None, operation_kind = OperationKind.Gemm):
        """
        :param cc: compute capability of device for which kernels should be compiled. For example, if running on H100, this should be set to 90
        :type cc: int
        :param kernel_cc: compute capability of kernels to generate. For example, if running on SM90, but desiring to use a CUTLASS 2.x-style Ampere kernel, this should be set to 80
        :type kernel_cc: int
+        :param operation_kind: class of operation that will be performed (e.g., GEMM, Conv)
+        :type operation_kind: cutlass_library.OperationKind
        """
        self.operation_kind = operation_kind
        self.cc = cc if cc is not None else device_cc()
@@ -64,13 +68,13 @@ class OperationBase:
        self.current_cc = kernel_cc if kernel_cc is not None else self._find_closest_cc(self.cc)
        self.tile_description = None

-        self.options = option_registry.options_for_cc(self.current_cc, operation_kind)
+        self.options = get_option_registry().options_for_cc(self.current_cc, operation_kind)

        if self.options is None:
            raise Exception(f"Invalid or unsupported compute capability: {self.current_cc}")

        # Default activation function: identity
-        self._activation = epilogue.identity
+        self._activation = identity

    def _find_closest_cc(self, cc: int) -> int:
        """
@@ -120,7 +124,7 @@ class OperationBase:
            if cc not in _generator_ccs:
                raise Exception(f'Invalid CC for CUTLASS kernels: {cc}.')
            self.current_cc = cc
-            self.options = option_registry.options_for_cc(self.current_cc, self.operation_kind)
+            self.options = get_option_registry().options_for_cc(self.current_cc, self.operation_kind)

    def _verify_scalar(self, scalar, ref_scalar, ref_dtype, name):
        """
@@ -158,9 +162,12 @@ class OperationBase:
    def _verify_tensor(self, tensor, ref_tensor, ref_dtype, ref_layout, name):
        """
        Verifies the following properties:
-            1) Either ``tensor`` or ``ref_tensor`` must be set (i.e., not ``None``)
-            2) If ``tensor`` is not ``None``, its datatype and layout must match matches the current versions
-               set by the plan (i.e., those in ``ref_dtype`` and ``ref_layout``)
+            If ref_dtype is not void:
+                1) Either ``tensor`` or ``ref_tensor`` must be set (i.e., not ``None``)
+                2) If ``tensor`` is not ``None``, its datatype and layout must match the current versions
+                set by the plan (i.e., those in ``ref_dtype`` and ``ref_layout``)
+            If ref_dtype is void:
+                Neither ``tensor`` nor ``ref_tensor`` are set

        If either of these properties does not hold, an exception is raised. If these properties hold and
        ``tensor`` is not ``None``, ``tensor`` is returned. Otherwise, ``ref_tensor`` is returned.
@@ -177,6 +184,11 @@ class OperationBase:
        :return: valid tensor object to use
        :rtype: numpy/cupy/torch array/tensor object
        """
+        if ref_dtype == DataType.void:
+            if tensor is not None or ref_tensor is not None:
+                raise Exception("Operands with element DataType.void must not be provided a tensor")
+            return None
+
        if tensor is None:
            if ref_tensor is None:
                raise Exception(f"Tensor {name} must be set.")
@@ -211,58 +223,60 @@ class OperationBase:
                f'({self._element_a}, {self._element_b}, {self._element_accumulator}) and '
                f'layout combination ({self._layout_a}, {self._layout_b}).')

-        # Changing the op class changes the elements per access in the epilogue. Reset this.
-        if self.op_class == cutlass.OpcodeClass.Simt:
-            elements_per_access = 1
-        else:
-            elements_per_access = 128 // cutlass.DataTypeSize[self._element_c]
-
-        if self.epilogue_functor is not None:
-            self.epilogue_functor = self._reset_epilogue_functor_alignment(elements_per_access, self.epilogue_functor)
-
        # Changing the op class also changes the possible operations available. Reset these.
        self.possible_operations = self.options.operations(
            self.op_class, self._element_a, self._element_b,
            self._element_accumulator, self._layout_a, self._layout_b)

+        # Changing the op class changes the elements per access in the epilogue. Reset this.
+        if self.epilogue_functor is not None:
+            self.epilogue_functor = self._reset_epilogue_functor_alignment(self._elements_per_access(), self.epilogue_functor)
+
    #
    # Epilogue
    #

+    def _elements_per_access(self):
+        if self.op_class == cutlass.OpcodeClass.Simt:
+            return 1
+        elif self._element_c != DataType.void:
+            return 128 // DataTypeSize[self._element_c]
+        else:
+            return 128 // max(self.possible_operations.alignments("C"))
+
    def _create_epilogue_functor_activation(self, activation):
        """
        Returns the epilogue functor with given activation function
        """
        if self.epilogue_functor is None:
-            if self.op_class == cutlass.OpcodeClass.Simt:
-                elements_per_access = 1
-            else:
-                elements_per_access = 128 // cutlass.DataTypeSize[self._element_c]
+            elements_per_access = self._elements_per_access()
        else:
            elements_per_access = self.epilogue_functor.epilogue_vector_length

        if not self.specified_kernel_cc:
-            if self.current_cc == 90 and activation != epilogue.identity:
-                # CUTLASS 3.0 kernels currently only support identity activation. If one requests a non-identity activation,
+            if self.current_cc == 90 and activation != identity:
+                # CUTLASS 3.0 kernels in Python currently only support identity activation. If one requests a non-identity activation,
                # revert to using a CUTLASS 2.x kernel by using SM80-tagged kernels.
                cutlass.logger.warning("Reverting to using SM80-tagged kernel. Opclass may change.")
+                if self._element_c != self._element_d:
+                    raise Exception("CUTLASS 2.x kernels require element C to be the same as element D")
                self._reset_options(80)
                self._reset_operations(reset_epilogue=False)
-            elif (self.cc == 90 and self.current_cc != 90 and activation == epilogue.identity):
+            elif (self.cc == 90 and self.current_cc != 90 and activation == identity):
                # SM80 fallback kernels are currently used. Since an identity activation is requested,
                # we can switch back to using SM90 kernels.
                self._reset_options(90)
                self._reset_operations(reset_epilogue=False)
        else:
-            if self.current_cc == 90 and activation != epilogue.identity:
+            if self.current_cc == 90 and activation != identity:
                raise Exception("Epilogues with elementwise fusion are not currently supported "
                                "in the Python interface for 3.x kernels. To use 2.x kernels "
                                "with fused elementwise epilogues, do not set the `kernel_cc` "
                                "parameter when constructing the Gemm object.")

-        return epilogue.get_activation_epilogue(
+        return get_activation_epilogue(
            activation,
-            self._element_c,
+            self._element_d,
            elements_per_access,
            self._element_accumulator,
            self._element_accumulator,
@@ -283,13 +297,13 @@ class OperationBase:

        if epilogue_functor is None or not hasattr(epilogue_functor, 'activation_functor'):
            # Identity epilogue does not have 'activation_functor'
-            activation = epilogue.identity
+            activation = identity
        else:
            activation = epilogue_functor.activation_functor

-        epilogue_functor = epilogue.get_activation_epilogue(
+        epilogue_functor = get_activation_epilogue(
            activation,
-            self._element_c,
+            self._element_d,
            alignment,
            self._element_accumulator,
            self._element_accumulator,
@@ -304,7 +318,7 @@ class OperationBase:
        if hasattr(self.epilogue_functor, "activation_functor"):
            return self.epilogue_functor.activation_functor
        else:
-            return epilogue.identity
+            return identity

    @activation.setter
    def activation(self, act):
@@ -363,8 +377,8 @@ class OperationBase:
            epilogue_smem_bytes = self.epilogue_functor.get_smem_size(td)

            # Verify the maximum number of mainloop stages
-            mainloop_smem_per_stage = check.calculate_smem_usage_per_stage(td, cutlass.OperationKind.Gemm)
-            smem_capacity_bytes = cutlass.SharedMemPerCC[self.cc] << 10
+            mainloop_smem_per_stage = check.calculate_smem_usage_per_stage(td, OperationKind.Gemm)
+            smem_capacity_bytes = SharedMemPerCC[self.cc] << 10
            mainloop_stages = (smem_capacity_bytes - epilogue_smem_bytes) // mainloop_smem_per_stage
            if mainloop_stages < 2:
                # Mainloop stages must >= 2
@@ -376,3 +390,11 @@ class OperationBase:
                "The epilogue consumes too much shared memory. "
                "No valid tile description is found in the generator.")
        self.possible_operations = new_possible_operations
+
+
+    def run_setup(self):
+        """
+        Steps that must be taken before calling `plan.run()`
+        """
+        # Initialize the memory pool, if not already done
+        cutlass.get_memory_pool()
--- a/python/cutlass/profiler/init.py
+++ b/python/cutlass/profiler/init.py
@@ -1,37 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-
-"""
-Profilers for Python Interface
-"""
-
-from cutlass.profiler.event_profiler import CUDAEventProfiler
--- a/python/cutlass/shape.py
+++ b/python/cutlass/shape.py
@@ -34,7 +34,7 @@
 Utilities for expressing shapes
 """

-from cutlass import (
+from cutlass_library import (
    ConvMode,
    ConvKind,
    LayoutType
@@ -64,7 +64,7 @@ class MatrixCoord:
        Returns the leading dimension for a matrix with layout ``layout`` and shape provided by the MatrixCoord.

        :param layout: layout of matrix
-        :type layout: cutlass.LayoutType
+        :type layout: cutlass_library.LayoutType

        :returns: leading dimension
        :rtype: int
--- a/python/cutlass/swizzle.py
+++ b/python/cutlass/swizzle.py
@@ -34,7 +34,7 @@
 Registry of swizzling functions
 """

-from cutlass import SwizzlingFunctor
+from cutlass_library import SwizzlingFunctor


 IdentitySwizzle1 = SwizzlingFunctor.Identity1
--- a/python/cutlass/utils/check.py
+++ b/python/cutlass/utils/check.py
@@ -36,26 +36,27 @@ Utility functions for checking constraints on kernels and calculating kernel att

 import ctypes

+from cutlass_library import DataTypeSize, OperationKind, SharedMemPerCC
+
 import cutlass
-from cutlass import DataTypeSize
 from cutlass.backend.library import TileDescription


-def calculate_smem_usage_per_stage(td: TileDescription, operation_kind: cutlass.OperationKind) -> int:
+def calculate_smem_usage_per_stage(td: TileDescription, operation_kind: OperationKind) -> int:
    """
    Returns the amount of shared memory in bytes consumed in a single stage of a kernel.

    :param td: tile description to compute shared memory of
    :type td: TileDescription
    :param operation_kind: identifier for the type of operation being performed
-    :type operation_kind: cutlass.OperationKind
+    :type operation_kind: cutlass_library.OperationKind

    :return: number of bytes of shared memory consumed by a single stage
    :rtype: int
    """
    m, n, k = td.threadblock_shape

-    if operation_kind == cutlass.OperationKind.Gemm:
+    if operation_kind == OperationKind.Gemm:
        stage_barrier_bytes = 32
        return (
            (DataTypeSize[td.math_instruction.element_a] * m * k // 8)
@@ -82,7 +83,8 @@ def valid_stage_count(
    kernel_cc: int,
    td: TileDescription,
    element_C: cutlass.DataType = None,
-    element_D: cutlass.DataType = None) -> tuple:
+    element_D: cutlass.DataType = None,
+    verbose: bool = True) -> tuple:
    """
    Checks whether a device with `cc` supports the number of stages within `tile_description`, both
    based on raw limits on the number of stages and based on shared memory capacity
@@ -97,6 +99,8 @@ def valid_stage_count(
    :type element_C: cutlass.DataType
    :param element_D: data type of operand D
    :type element_D: cutlass.DataType
+    :param verbose: whether to log warnings
+    :type verbose: bool

    :return: tuple with the first element indicating whether the provided tile description is
             valid for the provided device and the second element being an error message
@@ -107,7 +111,7 @@ def valid_stage_count(
            # Stage count of None or 0 for SM90 indicates that the CollectiveBuilder automatically
            # determines the stage count to use. Thus, all settings are valid in these scenarios.
            return (True, "")
-        else:
+        elif verbose:
            cutlass.logger.warning(
                "Setting an explicit stage count for SM90 kernels currently may "
                "result in compilation errors if the combination of tile shape, "
@@ -125,9 +129,9 @@ def valid_stage_count(
    # only catches cases in which the mainloop exceeds the device's shared memory capacity.
    # This is not a concern for CUTLASS 2.x kernels, for which the shared memory of the
    # mainloop and epilogue is shared.
-    smem_per_stage = calculate_smem_usage_per_stage(td, cutlass.OperationKind.Gemm)
+    smem_per_stage = calculate_smem_usage_per_stage(td, OperationKind.Gemm)
    smem_usage_mainloop = (smem_per_stage * td.stages)
-    smem_arch = cutlass.SharedMemPerCC[cc] << 10
+    smem_arch = SharedMemPerCC[cc] << 10
    if smem_usage_mainloop > smem_arch:
        return ( False,
            "Configuration uses too much shared memory. Consider reducing stage count or tile shape.\n"
@@ -214,7 +218,9 @@ def valid_schedule(
        return (False, "Kernel and epilogue schedules must either both be auto or neither be auto")

    if not tile_scheduler_default:
-        if (tile_scheduler == cutlass.TileSchedulerType.StreamK) and (kernel_schedule != cutlass.KernelScheduleType.TmaWarpSpecializedCooperative):
+        cooperative_kernels = [cutlass.KernelScheduleType.TmaWarpSpecializedCooperative, 
+                               cutlass.KernelScheduleType.CpAsyncWarpSpecializedCooperative]
+        if (tile_scheduler == cutlass.TileSchedulerType.StreamK) and (kernel_schedule not in cooperative_kernels):
            return (False, "Stream-K tile scheduler is currently only supported with the cooperative kernel schedule")
    return (True, "")

--- a/python/cutlass/utils/datatypes.py
+++ b/python/cutlass/utils/datatypes.py
@@ -35,33 +35,55 @@ Utility functions for converting between frontend datatypes and CUTLASS datatype
 """

 import cutlass
-from cutlass import (
+from cutlass_library import (
    DataTypeSize,
+    MathOperation,
+    MathInstruction
 )
 from cutlass.backend.library import (
-    MathInstruction,
-    MathOperation,
    TileDescription,
 )

-try:
-    import numpy as np
+bfloat16_available = None
+cupy_available = None
+numpy_available = None
+torch_available = None
+_library_to_cupy_dict = None
+_library_to_numpy_dict = None
+_library_to_torch_dict = None
+_torch_to_library_dict = None

-    numpy_available = True
-    _library_to_numpy_dict = {
-        cutlass.DataType.f16: np.float16,
-        cutlass.DataType.f32: np.float32,
-        cutlass.DataType.f64: np.float64,
-        cutlass.DataType.s8: np.int8,
-        cutlass.DataType.s32: np.int32,
-    }
-except ImportError:
-    numpy_available = False
-    _library_to_numpy_dict = {}
+
+def is_numpy_available():
+    global numpy_available, _library_to_numpy_dict
+    if numpy_available is None:
+        try:
+            import numpy as np
+
+            numpy_available = True
+            _library_to_numpy_dict = {
+                cutlass.DataType.f16: np.float16,
+                cutlass.DataType.f32: np.float32,
+                cutlass.DataType.f64: np.float64,
+                cutlass.DataType.s8: np.int8,
+                cutlass.DataType.s32: np.int32,
+            }
+        except ImportError:
+            numpy_available = False
+            _library_to_numpy_dict = {}
+    return numpy_available
+
+
+def is_numpy_tensor(inp) -> bool:
+    if is_numpy_available():
+        import numpy as np
+        return isinstance(inp, np.ndarray)
+    return False


 def numpy_library_type(inp) -> cutlass.DataType:
-    if numpy_available:
+    if is_numpy_available():
+        import numpy as np
        if inp == np.float16:
            return cutlass.DataType.f16
        elif inp == np.float32:
@@ -79,24 +101,36 @@ def numpy_type(inp):
    return _library_to_numpy_dict.get(inp, None)


-try:
-    import cupy as cp
+def is_cupy_available():
+    global cupy_available, _library_to_cupy_dict  # dict must be global too, else cupy_type() still sees None
+    if cupy_available is None:
+        try:
+            import cupy as cp

-    cupy_available = True
-    _library_to_cupy_dict = {
-        cutlass.DataType.f16: cp.float16,
-        cutlass.DataType.f32: cp.float32,
-        cutlass.DataType.f64: cp.float64,
-        cutlass.DataType.s8: cp.int8,
-        cutlass.DataType.s32: cp.int32,
-    }
-except ImportError:
-    cupy_available = False
-    _library_to_cupy_dict = {}
+            cupy_available = True
+            _library_to_cupy_dict = {
+                cutlass.DataType.f16: cp.float16,
+                cutlass.DataType.f32: cp.float32,
+                cutlass.DataType.f64: cp.float64,
+                cutlass.DataType.s8: cp.int8,
+                cutlass.DataType.s32: cp.int32,
+            }
+        except ImportError:
+            cupy_available = False
+            _library_to_cupy_dict = {}
+    return cupy_available
+
+
+def is_cupy_tensor(inp) -> bool:
+    if is_cupy_available():
+        import cupy as cp
+        return isinstance(inp, cp.ndarray)
+    return False


 def cupy_library_type(inp) -> cutlass.DataType:
-    if cupy_available:
+    if is_cupy_available():
+        import cupy as cp
        if inp == cp.float16:
            return cutlass.DataType.f16
        elif inp == cp.float32:
@@ -110,39 +144,50 @@ def cupy_type(inp):
    return _library_to_cupy_dict.get(inp, None)


-try:
-    import torch
+def is_torch_available():
+    global torch_available, _library_to_torch_dict, _torch_to_library_dict
+    if torch_available is None:
+        try:
+            import torch

-    torch_available = True
-    _torch_to_library_dict = {
-        torch.half: cutlass.DataType.f16,
-        torch.float16: cutlass.DataType.f16,
-        torch.bfloat16: cutlass.DataType.bf16,
-        torch.float: cutlass.DataType.f32,
-        torch.float32: cutlass.DataType.f32,
-        torch.double: cutlass.DataType.f64,
-        torch.float64: cutlass.DataType.f64,
-        torch.int8: cutlass.DataType.s8,
-        torch.int32: cutlass.DataType.s32,
-        torch.uint8: cutlass.DataType.u8,
-    }
+            torch_available = True
+            _torch_to_library_dict = {
+                torch.half: cutlass.DataType.f16,
+                torch.float16: cutlass.DataType.f16,
+                torch.bfloat16: cutlass.DataType.bf16,
+                torch.float: cutlass.DataType.f32,
+                torch.float32: cutlass.DataType.f32,
+                torch.double: cutlass.DataType.f64,
+                torch.float64: cutlass.DataType.f64,
+                torch.int8: cutlass.DataType.s8,
+                torch.int32: cutlass.DataType.s32,
+                torch.uint8: cutlass.DataType.u8,
+            }

-    _library_to_torch_dict = {
-        cutlass.DataType.f16: torch.half,
-        cutlass.DataType.f16: torch.float16,
-        cutlass.DataType.bf16: torch.bfloat16,
-        cutlass.DataType.f32: torch.float,
-        cutlass.DataType.f32: torch.float32,
-        cutlass.DataType.f64: torch.double,
-        cutlass.DataType.f64: torch.float64,
-        cutlass.DataType.s8: torch.int8,
-        cutlass.DataType.s32: torch.int32,
-        cutlass.DataType.u8: torch.uint8,
-    }
-except ImportError:
-    torch_available = False
-    _torch_to_library_dict = {}
-    _library_to_torch_dict = {}
+            _library_to_torch_dict = {
+                cutlass.DataType.f16: torch.half,
+                cutlass.DataType.f16: torch.float16,
+                cutlass.DataType.bf16: torch.bfloat16,
+                cutlass.DataType.f32: torch.float,
+                cutlass.DataType.f32: torch.float32,
+                cutlass.DataType.f64: torch.double,
+                cutlass.DataType.f64: torch.float64,
+                cutlass.DataType.s8: torch.int8,
+                cutlass.DataType.s32: torch.int32,
+                cutlass.DataType.u8: torch.uint8,
+            }
+        except ImportError:
+            torch_available = False
+            _torch_to_library_dict = {}
+            _library_to_torch_dict = {}
+    return torch_available
+
+
+def is_torch_tensor(inp) -> bool:
+    if is_torch_available():
+        import torch
+        return isinstance(inp, torch.Tensor)
+    return False


 def torch_library_type(inp) -> cutlass.DataType:
@@ -153,28 +198,35 @@ def torch_type(inp):
    return _library_to_torch_dict.get(inp, None)


-try:
-    import bfloat16
+def is_bfloat16_available():
+    global bfloat16_available

-    bfloat16_available = True
-except ImportError:
-    bfloat16_available = False
+    if bfloat16_available is None:
+        try:
+            import bfloat16
+
+            bfloat16_available = True
+        except ImportError:
+            bfloat16_available = False
+    return bfloat16_available


 def bfloat16_library_type(inp) -> cutlass.DataType:
-    if bfloat16_available:
+    if is_bfloat16_available():
+        import bfloat16
        if inp == bfloat16.bfloat16:
            return cutlass.DataType.bf16


 def bfloat16_type(inp):
-    if bfloat16_available:
+    if is_bfloat16_available():
+        import bfloat16
        if inp == cutlass.DataType.bf16:
            return bfloat16.bfloat16


 def library_type(inp):
-    if inp in cutlass.DataTypeSize.keys():
+    if inp in DataTypeSize:
        return inp

    for cvt_fn in [
@@ -205,23 +257,20 @@ def _tensor_from_torch(pt_tensor):


 def get_datatype_and_layout(tensor):
-    if (numpy_available and isinstance(tensor, np.ndarray)) or (
-        cupy_available and isinstance(tensor, cp.ndarray)
-    ):
+    if (is_numpy_tensor(tensor) or is_cupy_tensor(tensor)):
        return _tensor_from_numpy(tensor)
-    elif torch_available and isinstance(tensor, torch.Tensor):
+    elif is_torch_tensor(tensor):
        return _tensor_from_torch(tensor)
    elif isinstance(tensor, float) or isinstance(tensor, int):
        return (cutlass.DataType.f32, cutlass.LayoutType.RowMajor)
    else:
        raise Exception(f"Unable to convert tensor of type {type(tensor)} to Python-bound CUTLASS datatype and layout.")

+
 def get_tensor_shape(tensor, op="GEMM"):
-    if (numpy_available and isinstance(tensor, np.ndarray)) or (
-        cupy_available and isinstance(tensor, cp.ndarray)
-    ):
+    if (is_numpy_tensor(tensor) or is_cupy_tensor(tensor)):
        return tensor.shape
-    elif torch_available and isinstance(tensor, torch.Tensor):
+    elif is_torch_tensor(tensor):
        size = tensor.size()
        if op == "CONV":
            # PyTorch Tensors have shape NCHW
@@ -237,7 +286,7 @@ def get_tensor_shape(tensor, op="GEMM"):
 _math_operation_value_map = {x.value: x for x in MathOperation}


-def backend_math_operation(math_op: cutlass.MathOperation):
+def backend_math_operation(math_op: MathOperation):
    if math_op.value not in _math_operation_value_map.keys():
        raise Exception(f"Unable to convert math operation of type {math_op} to backend math operation.")
    return _math_operation_value_map[math_op.value]
--- a/python/cutlass/profiler/event_profiler.py
+++ b/python/cutlass/profiler/event_profiler.py
@@ -39,12 +39,12 @@ import subprocess

 from cuda import cuda, cudart
 import numpy as np
-import torch

 from cutlass import CUTLASS_PATH
 from cutlass.backend.library import DataTypeSize
 from cutlass.op.op import OperationBase
 from cutlass.shape import GemmCoord
+from cutlass.utils.datatypes import is_numpy_tensor


 class GpuTimer: