ggml : add ggml_soft_max_ext (#4256)

* metal : implement soft_max_ext
* cuda : implement soft_max_ext
* ggml : implement soft_max_ext (CPU)
* batched-bench : print threads
  ggml-ci
* metal : simplify soft_max encoding
  ggml-ci
* cuda : use 512 threads for soft_max instead of 32
* ggml : update soft max cpu
* cuda : do warp-based block reduce
* cuda : increase max block size to 1024
* cuda : fix warp reduction initialization of shared mem
* metal : warp-based reduction for soft max kernel
* metal : warp-based reduce for rms_norm
* metal : simplify soft max kernel
  ggml-ci
* alloc : fix build with debug
2026-04-30 19:31:48 +00:00 · 2023-12-01 10:51:24 +02:00
parent 49e1009f75
commit 88dbc5a1e9
8 changed files with 311 additions and 196 deletions
--- a/ggml.h
+++ b/ggml.h
@@ -1282,6 +1282,14 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

+    // fused soft_max(a*scale + mask): a is multiplied by scale and mask is added before soft_max is applied
+    // mask is optional (NOTE(review): presumably NULL omits the mask term; confirm against the implementation)
+    GGML_API struct ggml_tensor * ggml_soft_max_ext(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * mask,
+            float                 scale);
+
    GGML_API struct ggml_tensor * ggml_soft_max_back(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,