Step-3.5-Flash support (#1231)

* WIP

* This works but is slow

* Turn off the up / gate clamps for now

* OK we need the clamping

* Fuse the clamp (CUDA)

* Fuse the clamp (CPU)

* WIP

* Be able to use merged q, k, v

* Be able to use merged up/gate experts

* Fuse the clamp (CUDA mmvq)
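For context on the "fuse the clamp" items above: Step-3.5 clamps the up and gate projections before the SwiGLU activation, and fusing that clamp into the activation kernel saves a separate pass over the tensor on both CPU and CUDA. Below is a minimal CPU sketch of the idea; the function name, the symmetric clamp form, and the separate gate/up limits are assumptions for illustration, not the actual kernels from this commit.

#include <algorithm>
#include <cmath>

// Fused clamped SwiGLU (sketch): clamp gate/up to symmetric limits and apply
// silu(gate) * up in a single pass, with no intermediate clamped tensors.
static void fused_clamped_swiglu(const float * gate, const float * up, float * dst,
                                 int n, float gate_limit, float up_limit) {
    for (int i = 0; i < n; ++i) {
        const float g = std::clamp(gate[i], -gate_limit, gate_limit);
        const float u = std::clamp(up[i],   -up_limit,   up_limit);
        dst[i] = u * g / (1.0f + std::exp(-g)); // silu(g) = g * sigmoid(g)
    }
}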
Author: Kawrakow
Date: 2026-02-05 08:13:22 +02:00
Committed by: GitHub
Parent: 8d952ff183
Commit: 9c1c74acda
22 changed files with 487 additions and 69 deletions


@@ -71,6 +71,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_MISTRAL3, "mistral3" },
{ LLM_ARCH_MIMO2, "mimo2" },
{ LLM_ARCH_SEED_OSS, "seed_oss" },
+{ LLM_ARCH_STEP35, "step35" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
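The name table above pairs with the llm_arch enum used throughout the loader. A sketch of the matching enum addition follows; the neighbouring entries are taken from the table above, and the exact placement in the header is assumed:

enum llm_arch {
    // ... earlier architectures elided ...
    LLM_ARCH_MISTRAL3,
    LLM_ARCH_MIMO2,
    LLM_ARCH_SEED_OSS,
    LLM_ARCH_STEP35, // new: Step-3.5-Flash
    LLM_ARCH_UNKNOWN,
};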
@@ -128,6 +129,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
{ LLM_KV_TOKEN_SHIFT_COUNT, "%s.token_shift_count" },
{ LLM_KV_INTERLEAVE_MOE_LAYER_STEP, "%s.interleave_moe_layer_step" },
+{ LLM_KV_SWIGLU_LIMITS, "%s.swiglu_limits" },
+{ LLM_KV_SWIGLU_LIMITS_SHARED, "%s.swiglu_limits_shared" },
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
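The "%s" in each pattern is substituted with the architecture name when keys are looked up, so for this model the two new entries resolve to "step35.swiglu_limits" and "step35.swiglu_limits_shared" in the GGUF metadata. A self-contained demo of that substitution (plain snprintf here; the real code goes through this file's key formatter):

#include <cstdio>

int main() {
    char buf[128];
    // "%s" is replaced by the architecture name, "step35" for this model.
    std::snprintf(buf, sizeof(buf), "%s.swiglu_limits", "step35");
    std::printf("%s\n", buf); // prints: step35.swiglu_limits
    return 0;
}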
@@ -151,8 +154,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
+{ LLM_KV_ROPE_DIMENSION_COUNT_PER_LAYER,"%s.rope.dimension_count_per_layer" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
+{ LLM_KV_ROPE_FREQ_BASE_PER_LAYER, "%s.rope.freq_base_per_layer" },
{ LLM_KV_ROPE_FREQ_BASE_SWA, "%s.rope.freq_base_swa" },
{ LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
{ LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
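The two *_PER_LAYER keys suggest Step-3.5 uses a different RoPE dimension count and frequency base on different layers instead of one model-wide value. A hypothetical helper showing how such a per-layer array might be consumed; the name and the fall-back-to-model-default behaviour are assumptions, not code from this commit:

#include <vector>

// Hypothetical: pick the RoPE frequency base for layer il, falling back to the
// model-wide training value when no per-layer array was present in the GGUF.
static float rope_freq_base_for_layer(const std::vector<float> & per_layer,
                                      float model_default, int il) {
    return (il >= 0 && il < (int) per_layer.size()) ? per_layer[il] : model_default;
}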