WMMA grouped conv fwd large tensor extra flavors (#3582)

* Additional flavors for WMMA conv fwd large tensor

- added F16/BF16 clamp operation
- added F16/BF16 bias_clamp operation
- small modification to the device code to accommodate the extra tensors

* changed strategy for handling the GemmArgs array

* Adding generic instance

* Added generic instance to clamp and bias_clamp ops
Author: Wojciech Laskowski
Date: 2026-01-23 12:19:51 +01:00 (committed by GitHub)
Parent: 7b3db1a878
Commit: 81ee19bd2c
27 changed files with 1007 additions and 171 deletions


@@ -6,6 +6,8 @@
 #include "functional2.hpp"
 #include "sequence.hpp"
+#include <type_traits>
+#include <cassert>

 namespace ck {
@@ -27,6 +29,15 @@ struct Array
     __host__ __device__ constexpr TData& operator()(index_t i) { return At(i); }

+    template <typename... Args>
+    __host__ constexpr auto Emplace(index_t i, Args&&... args)
+        -> std::enable_if_t<std::is_nothrow_constructible_v<TData, Args&&...>>
+    {
+        assert(i >= 0 && i < NSize);
+        mData[i].~TData();
+        new(mData + i) TData(ck::forward<Args>(args)...);
+    }
+
     template <typename T>
     __host__ __device__ constexpr auto operator=(const T& a)
     {