Step-3.5-Flash support (#1231)

* WIP
* This works but is slow
* Turn off the up / gate clamps for now
* OK we need the clamping
* Fuse the clamp (CUDA)
* Fuse the clamp (CPU)
* WIP
* Be able to use merged q, k, v
* Be able to use merged up/gate experts
* Fuse the clamp (CUDA mmvq)
2026-04-26 01:19:20 +00:00 · 2026-02-05 08:13:22 +02:00
parent 8d952ff183
commit 9c1c74acda
22 changed files with 487 additions and 69 deletions
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -5051,6 +5051,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
        case LLM_ARCH_MINIMAX_M2:
        case LLM_ARCH_MIMO2:
        case LLM_ARCH_SEED_OSS:
+        case LLM_ARCH_STEP35:
            return LLAMA_ROPE_TYPE_NEOX;

        case LLM_ARCH_QWEN2VL: