Restructure gridwise and blockwise GEMM, add tensor contraction and FWD-v4r5 (#36)

* experimenting with magic number division * overhauling fwd-v4r4 to clearly reflect the transformation graph * added fwd-v4r5 * bug fix for make_dynamic_naive_tensor_descriptor_aligned_v2 * bug fix and added sanity-check in transform_dynamic_tensor_descriptor * added conv_driver_v2 [ROCm/composable_kernel commit: 30072aec37]
2026-05-19 20:40:07 +00:00 · 2021-06-09 23:53:08 -05:00
parent 040023fdcd
commit f4acec502e
38 changed files with 4791 additions and 2050 deletions
--- a/composable_kernel/include/utility/magic_division.hpp
+++ b/composable_kernel/include/utility/magic_division.hpp
@@ -118,6 +118,7 @@ struct MagicDivision
        return (tmp + dividend) >> shift;
    }

+#if 1 // debug
    // HACK: magic division for int32_t
    // HACK: use dividend_i32 as if it's uint32_t, dividend_i32 need to be
    // non-negative for result to be correct
@@ -127,8 +128,25 @@ struct MagicDivision
    {
        uint32_t dividend_u32 = as_type<uint32_t>(dividend_i32);
        uint32_t tmp          = ((uint64_t)dividend_u32 * (uint64_t)multiplier) >> 32;
-        return (tmp + dividend_i32) >> shift;
+        return (tmp + dividend_u32) >> shift;
    }
+#else
+    // Inline-ASM variant of DoMagicDivision for int32_t. It is compiled out by the
+    // "#if 1" above because it was observed to produce wrong results.
+    //
+    // The instruction sequence evaluates
+    //     r = (mul_hi_u32(dividend, multiplier) + dividend) >> shift
+    // with dividend_i32 reinterpreted as uint32_t, mirroring the C++ variant.
+    //
+    // The output operand is marked early-clobber ("=&v"): %0 is written by the first
+    // instruction while input %1 is still read by the second one, so the compiler
+    // must not assign both operands to the same register.
+    //
+    // NOTE(review): the previously missing early-clobber is a plausible cause of the
+    // wrong results, but this has not been verified on hardware. The "s" constraints
+    // presumably also require multiplier and shift to be uniform across the
+    // wavefront -- TODO confirm against callers before re-enabling this path.
+    __host__ __device__ static int32_t
+    DoMagicDivision(int32_t dividend_i32, uint32_t multiplier, uint32_t shift)
+    {
+        uint32_t r;
+        asm volatile("\n \
+                v_mul_hi_u32 %0, %1, %2 \n \
+                v_add_u32_e32 %0, %1, %0 \n \
+                v_lshrrev_b32_e32 %0, %3, %0 \n \
+                "
+                     : "=&v"(r)
+                     : "v"(as_type<uint32_t>(dividend_i32)), "s"(multiplier), "s"(shift));
+
+        return as_type<int32_t>(r);
+    }
+#endif
 };

 } // namespace ck