[CuTeDSL] Add sub_packed_f32x2 operation

Add a subtraction operation for packed f32x2 values, following the same pattern as the existing add_packed_f32x2 and mul_packed_f32x2 operations.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-04-19 22:38:56 +00:00 · 2026-02-04 21:18:46 +07:00
parent 6b3e607b85
commit 51935551fb
2 changed files with 4 additions and 0 deletions
--- a/python/CuTeDSL/cutlass/cute/arch/__init__.py
+++ b/python/CuTeDSL/cutlass/cute/arch/__init__.py
@@ -96,6 +96,7 @@ __all__ = [
    "fma_packed_f32x2",
    "mul_packed_f32x2",
    "add_packed_f32x2",
+    "sub_packed_f32x2",
    "fmax",
    "rcp_approx",
    "exp2",
--- a/python/CuTeDSL/cutlass/cute/arch/nvvm_wrappers.py
+++ b/python/CuTeDSL/cutlass/cute/arch/nvvm_wrappers.py
@@ -940,6 +940,9 @@ mul_packed_f32x2 = partial(
 add_packed_f32x2 = partial(
    calc_packed_f32x2_op, src_c=None, calc_func=nvvm.add_packed_f32x2
 )
+sub_packed_f32x2 = partial(
+    calc_packed_f32x2_op, src_c=None, calc_func=nvvm.sub_packed_f32x2
+)


@dsl_user_op